lterriel committed
Commit
519b419
1 Parent(s): 73f630f
app.py ADDED
@@ -0,0 +1,325 @@
+ #!/usr/bin/env python3
+ # -*- coding:utf-8 -*-
+ 
+ from collections import defaultdict, Counter
+ from itertools import combinations
+ 
+ import pandas as pd
+ import seaborn as sn
+ import matplotlib.pyplot as plt
+ import streamlit as st
+ 
+ from n4a_analytics_lib.analytics import GlobalStatistics, IaaStatistics
+ from n4a_analytics_lib.metrics_utils import interpret_kappa, fleiss_kappa_function, cohen_kappa_function
+ 
+ TITLE = "NER4ARCHIVES Analytics"
+ 
+ # Set application
+ st.set_page_config(layout="wide")
+ 
+ # sidebar: meta, inputs etc.
+ sidebar = st.sidebar
+ # cols: display results
+ col1, col2 = st.columns(2)
+ 
+ # description
+ sidebar.markdown(f"""
+ # 📏 {TITLE}
+ 
+ A basic web application that displays a dashboard for analyzing an
+ INCEpTION annotation project, built in the context of NER4Archives
+ (Inria / Archives nationales).
+ 
+ This tool provides two statistics levels:
+ - *Global project statistics*: analyze named entities across all curated documents in the project;
+ - *Inter-Annotator Agreement results*: analyze the results of an IAA experiment.
+ """)
+ 
+ # Level to analyze
+ option = sidebar.selectbox('Which statistics level?', ('Inter-Annotator Agreement results', 'Global project statistics'))
+ 
+ # IAA results view
+ if option == "Inter-Annotator Agreement results":
+     annotations = sidebar.file_uploader("Upload IAA annotations (.zip format only): ")
+     baseline_text = sidebar.file_uploader("Upload baseline text (.txt format only): ")
+ 
+     if baseline_text is not None and annotations is not None:
+         project_analyzed = IaaStatistics(zip_project=annotations, baseline_text=baseline_text.getvalue())
+         baseline_analyzer = project_analyzed.analyze_text()
+ 
+         col2.markdown(f"""
+         ### BASELINE TEXT: {baseline_text.name}
+ 
+         - sentences: {baseline_analyzer[0]}
+         - words: {baseline_analyzer[1]}
+         - characters: {baseline_analyzer[2]}
+         """)
+ 
+         # mentions seen by any coder, deduplicated while preserving order
+         commune_mentions = [l for i, j in project_analyzed.mentions_per_coder.items() for l in j]
+         commune_mentions = list(dict.fromkeys(commune_mentions))
+ 
+         # count annotations per label for each coder
+         dicts_coders = []
+         for coder, coder_annotations in project_analyzed.annotations_per_coders.items():
+             dict_coder = dict(Counter(coder_annotations.values()))
+             dicts_coders.append(dict_coder)
+ 
+         # union of the labels used by all coders (not only the first one)
+         labels = sorted({label for dict_coder in dicts_coders for label in dict_coder})
+ 
+         df = pd.DataFrame(project_analyzed.annotations_per_coders, index=commune_mentions)
+         for ann in project_analyzed.annotators:
+             df[ann] = 'None'
+             for mention, value in project_analyzed.annotations_per_coders[ann].items():
+                 df.loc[mention, ann] = value
+ 
+         total_annotations = len(df)
+ 
+         df_n = df.apply(pd.Series.value_counts, axis=1).fillna(0).astype(int)
+         matrix = df_n.values
+ 
+         pairs = list(combinations(project_analyzed.annotations_per_coders, 2))
+ 
+         # Display in app
+         cont_kappa = st.container()
+         cont_kappa.title("Inter-Annotator Agreement (IAA) results")
+         tab1, tab2, tab3, tab4, tab5 = cont_kappa.tabs(
+             ["📈 IAA metrics", "🗃 IAA Metrics Legend", "✔️ Agree annotations", "❌ Disagree annotations",
+              "🏷️ Global Labels Statistics"])
+         tab1.subheader("Fleiss Kappa (global score for group):")
+         tab1.markdown(interpret_kappa(round(fleiss_kappa_function(matrix), 2)), unsafe_allow_html=True)
+         tab1.subheader("Cohen Kappa Annotators Matrix (score between annotators):")
+         data = []
+         for coder_1, coder_2 in pairs:
+             cohen_function = cohen_kappa_function(project_analyzed.labels_per_coder[coder_1], project_analyzed.labels_per_coder[coder_2])
+             data.append(((coder_1, coder_2), cohen_function))
+             tab1.markdown(f"* {coder_1} <> {coder_2} : {interpret_kappa(cohen_function)}", unsafe_allow_html=True)
+ 
+         intermediary = defaultdict(Counter)
+         for (src, tgt), count in data:
+             intermediary[src][tgt] = count
+ 
+         letters = sorted({key for inner in intermediary.values() for key in inner} | set(intermediary.keys()))
+ 
+         confusion_matrix = [[intermediary[src][tgt] for tgt in letters] for src in letters]
+ 
+         df_cm = pd.DataFrame(confusion_matrix, letters, letters)
+         mask = df_cm.values == 0
+         sn.set(font_scale=0.7)  # label size
+         colors = ["#e74c3c", "#f39c12", "#f4d03f", "#5dade2", "#58d68d", "#28b463"]
+         # the default value (14) must stay within the slider bounds
+         width = tab1.slider("matrix width", 1, 20, 14)
+         height = tab1.slider("matrix height", 1, 20, 4)
+         fig, ax = plt.subplots(figsize=(width, height))
+         sn.heatmap(df_cm, cmap=colors, annot=True, mask=mask, annot_kws={"size": 7}, vmin=0, vmax=1, ax=ax)
+         tab1.pyplot(ax.figure)
+         tab2.markdown("""
+         <table>
+           <thead>
+             <tr><th colspan="2">Kappa interpretation legend</th></tr>
+           </thead>
+           <tbody>
+             <tr><td>Kappa score (k)</td><td>Agreement</td></tr>
+             <tr style="background-color: #e74c3c;"><td>k &lt; 0</td><td>Less than chance agreement</td></tr>
+             <tr style="background-color: #f39c12;"><td>0.01 &ndash; 0.20</td><td>Slight agreement</td></tr>
+             <tr style="background-color: #f4d03f;"><td>0.21 &ndash; 0.40</td><td>Fair agreement</td></tr>
+             <tr style="background-color: #5dade2;"><td>0.41 &ndash; 0.60</td><td>Moderate agreement</td></tr>
+             <tr style="background-color: #58d68d;"><td>0.61 &ndash; 0.80</td><td>Substantial agreement</td></tr>
+             <tr style="background-color: #28b463;"><td>0.81 &ndash; 1.00</td><td>Almost perfect agreement</td></tr>
+           </tbody>
+         </table>
+         """, unsafe_allow_html=True)
+ 
+         ## common helpers
+         @st.cache
+         def convert_df(df_ex):
+             return df_ex.to_csv(encoding="utf-8").encode('utf-8')
+ 
+         ## Agree part
+         columns_to_compare = project_analyzed.annotators
+ 
+         def check_all_equal(iterator):
+             return len(set(iterator)) <= 1
+ 
+         df_agree = df[df[columns_to_compare].apply(check_all_equal, axis=1)]
+         total_unanime = len(df_agree)
+         csv_agree = convert_df(df_agree)
+ 
+         tab3.subheader("Total agree annotations:")
+         tab3.markdown(f"{total_unanime} / {len(df)} annotations ({round((total_unanime / len(df)) * 100, 2)} %)")
+         tab3.download_button(
+             "Press to Download CSV",
+             csv_agree,
+             "csv_annotators_agree.csv",
+             "text/csv",
+             key='download-csv-1'
+         )
+         tab3.dataframe(df_agree)
+ 
+         ## Disagree part
+         def check_all_not_equal(iterator):
+             return len(set(iterator)) > 1
+ 
+         df_disagree = df[df[columns_to_compare].apply(check_all_not_equal, axis=1)]
+         total_desaccord = len(df_disagree)
+         csv_disagree = convert_df(df_disagree)
+         tab4.subheader("Total disagree annotations:")
+         tab4.markdown(
+             f"{total_desaccord} / {len(df)} annotations ({round((total_desaccord / len(df)) * 100, 2)} %)")
+         tab4.download_button(
+             "Press to Download CSV",
+             csv_disagree,
+             "csv_annotators_disagree.csv",
+             "text/csv",
+             key='download-csv-2'
+         )
+         tab4.dataframe(df_disagree)
+ 
+         ## label alignment chart
+         def count_total_annotations_label(dataframe, labels):
+             pairs = []
+             for label in labels:
+                 total = dataframe.astype(object).eq(label).any(axis=1).sum()
+                 pairs.append((label, total))
+             return pairs
+ 
+         totals_annotations_per_labels = count_total_annotations_label(df, labels)
+ 
+         # Count, per label, how many mentions received the same class from all annotators
+         def total_agree_disagree_per_label(dataframe, pairs_totals_labels):
+             new_pairs = []
+             for label, total_rows_with_label in pairs_totals_labels:
+                 agree_res = dataframe[dataframe.nunique(axis=1).eq(1)].eq(label).any(axis=1).sum()
+                 disagree_res = total_rows_with_label - agree_res
+                 agree_percent = (agree_res / total_rows_with_label) * 100
+                 disagree_percent = (disagree_res / total_rows_with_label) * 100
+                 new_pairs.append((label, total_rows_with_label, agree_percent, disagree_percent))
+             return new_pairs
+ 
+         to_pie = total_agree_disagree_per_label(df, totals_annotations_per_labels)
+ 
+         def plot_pies(tasks_to_pie):
+             my_labels = 'agree', 'disagree'
+             my_colors = ['#47DBCD', '#F5B14C']
+             my_explode = (0, 0.1)
+             # squeeze=False keeps axes indexable even with a single label
+             fig, axes = plt.subplots(1, len(tasks_to_pie), figsize=(20, 3), squeeze=False)
+             for counter, t in enumerate(tasks_to_pie):
+                 tasks = [t[2], t[3]]
+                 axes[0][counter].pie(tasks, autopct='%1.1f%%', startangle=15, shadow=True, colors=my_colors,
+                                      explode=my_explode)
+                 axes[0][counter].set_title(t[0])
+                 axes[0][counter].axis('equal')
+             fig.set_facecolor("white")
+             fig.legend(labels=my_labels, loc="center right", borderaxespad=0.1, title="Labels alignment")
+             return fig
+ 
+         f = plot_pies(to_pie)
+         tab5.pyplot(f.figure)
+ 
+ # global project results view
+ 
+ # session-state reset used by the uploader's on_change callback
+ def clear_cache():
+     st.session_state["p_a"] = None
+ 
+ if option == "Global project statistics":
+     # guard: the key may not exist yet on the first run
+     if "p_a" not in st.session_state:
+         st.session_state["p_a"] = None
+     project = sidebar.file_uploader("Project folder that contains curated annotations in XMI 1.1 (.zip format only): ", on_change=clear_cache)
+     if project is not None:
+         if st.session_state["p_a"] is None:
+             st.session_state["p_a"] = GlobalStatistics(zip_project=project)
+         if st.session_state["p_a"] is not None:
+             with st.expander('Details on data'):
+                 col1.metric("Total curated annotations",
+                             f"{st.session_state['p_a'].total_annotations_project} Named entities")
+                 col1.dataframe(st.session_state['p_a'].df_i)
+                 selected_data = col1.selectbox('Select specific data to display bar plot:',
+                                                st.session_state['p_a'].documents)
+                 col2.pyplot(st.session_state['p_a'].create_plot(selected_data))
+
datatest/curation_rapid_global.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:804a01b2ffae53103cd67fa51671ccbbbc988cf2796ec40ccb20f1e9283c1b47
+ size 4670583
datatest/doc_baseline_IAA_4.txt ADDED
@@ -0,0 +1,53 @@
+ Les Abymes, Association pour l'utilisation du Rein Artificiel (AUDRA) (code démarche : 6173), 2010-2011 ; Centre hospitalier gérontologique du Raizet (code démarche : 6182), 2006-2010 ; Polyclinique de La Guadeloupe (code démarche : 5838), 2008-2011 Pointe-à-Pitre, Centre hospitalier universitaire (CHU) Pointe-à-Pitre - Abymes (code démarche : 5777), 2010.
+ 
+ 74-734, cliché n°390, matricule n°23 : vue d'ensemble du torrent de Saint-Martin[-Sallanches] depuis les bords de l'Arve (KUSS, 4 ex, 1890) 74-735, cliché n°391, matricule n°24 : vue d'ensemble du torrent de Saint-Martin[-Sallanches] depuis les bords de l'Arve (KUSS, 1 ex, 1890).
+ 
+ « Établissements Barbaud, boucherie hippophagique, fabrique de saucissons et salaisons » : projets de statuts constitué par Fernand Édouard Marius Barbaud, Alice Blanche Barbaud et Pierre Roger Robert Barbaud ; statuts définitifs ; état des clients débiteurs et inventaire au 3 mars 1938 ; autre inventaire au 31 décembre 1938 ; estimation d'une propriété au 148-150 rue Castagnary, 15e arrondissement, dépendant de la succession de François Barbaud ; notes, comptes, baux, plans, correspondance.
+ 
+ MC/DC/CXX/103 « La Gazette du Franc, revue hebdomadaire d'information et d'économie politique et financière » 1930-1931.
+ 
+ 1PH/413 Neufchef à Rimling.
+ 
+ 1PH/414 Rohrbach-lès-Bitche à Saint-Louis.
+ 
+ 1PH/415 Saint-Louis-lès-Bitche à Zoufftgen.
+ 
+ 20100562/512 Reportages 4305 Obsèques du pape Jean Paul II au Vatican 08/04/2005.
+ 
+ 20100562/513 Reportages 4306 Entretien avec DRISS JETTOU Premier ministre du MAROC 09/04/2005.
+ 
+ PA_324 Documents concernant Henri Lambert (1897-1958), appelé en janvier 1916, affecté au 27e RI, déserteur le 5 mars 1916, emprisonné à Lons-le-Saunier, affecté ensuite au 29e RI, blessé et affecté au 106e RI 1919-2004.
+ 
+ Louis Veuillot (une lettre à Mgr de Salinis, relatant une entrevue avec Napoléon III, 1858, transmise au prince par le baron Durrien en 1902) ; Georges Vicaire ; colonel Villot.
+ 
+ CP/F/14/17531/19, pièce 128 "First Order Light House for Cape Henry, VA Plate 2" [Phare de premier ordre du Cap Henry, Virginie Planche 2] Dressé par Peter C HAINS, ingénieur secrétaire Mai 1879.
+ 
+ BB/29/1082 page 116 La Ferté-sous-Jouarre (Seine-et-Marne).
+ 
+ D'azur semé de fleurs de lys d'or, au lion passant du même.
+ 
+ BB/29/1082 page 117 Pont-Audemer (Eure).
+ 
+ N/III/Jura/12 « PLAN DES TERRES VAINES ET VAGUES SCITUEZ DERRIÈRE LES CAPUSINS DE LA VILLE DE DOLE » Jean-Pierre Baronnet, géomètre arpenteur royal à Besançon, 1775.
+ 
+ Pl d'une partie de la forêt de Chaux; lieux dits ; grand chemin conduisant à la Vieille Loye, chemin de la Grande Loye; baraques des Tuileries, broussailles Notice 1280.
+ 
+ MC/ET/VII/814 21 Règlement entre Désirée Anne Radigue, veuve de Bajamin dit Benjamin Bourdin, ayant demeuré route Stratégique, au coin de la rue Saint-Fargeau, propriétaire, demeurant 71, rue de Rochechouart, actuellement à Choisy-le-Roi (Val-de-Marne), aux héritiers de son mari, Claude Eustache Bourdin, Marie Claudine Bourdin, Pierre Louis Bourdin, les époux Chalbot 28 juin 1861.
+ 
+ RAPATEL, de Rennes, colonel d'artillerie en retraite Demande d'avancement pour son neveu, maréchal des logis 1855.
+ 
+ Projets de lois de réorganisation, 1945-1947 Effectifs de l'année 1946 Programmes annuel de tournée : année 1945 (missions effectuées pour le Ministère de la Justice) ; année 1949 (contrôle des services départementaux dépendant du Ministère de l'Intérieur) Méthodes de travail: exploitation des rapports des inspecteurs de l'IGA Ecole nationale d'Administration (ENA): projet de stage des élèves de l'ENA dans les préfectures 1946, organisation de l'examen de classement et résultats généraux du concours de l'année 1948 (promotion Croix de Lorraine) et de l'année 1949 (promotion Quarante-huit).
+ 
+ Z/1j/257 Estimation des réparations à faire en une maison, rue du Plâtre, acquise par François Mansart de Guy Arbaleste, vicomte de Melun, et de Marie de Montmorency, sa femme 11 mars 1639.
+ 
+ Z/1j/258 Estimation des réparations à faire en une maison, rue du Plâtre, appartenant à Richard Le Gras et louée à Guillaume Pierre 8 août 1639.
+ 
+ TAILHADE, Laurent Encouragements, indemnités et secours, 12 juillet 1918 - 13 mai 1925 F/21/4154.
+ 
+ TALBOT-READY, 5 août 1887 - 8 mai 1908 F/21/4455.
+ 
+ TANO, Nicolas, 28 mars 1890 - 19 novembre 1937 F/21/4455.
+ 
+ 1 Tourtour, Var, arr de Draguignan, cant de Salernes.
+ 
+ 106 Blois Confirmation des privilèges des doyen et chapitre de l'église Saint-Vulfran d'Abbeville (Fol 18, n° 106) Novembre 1501.
datatest/ner4archives-template_curated_documents_2022-09-05_1345.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:904667ecd2393a040a53d1b99137d123b1e98591cff71b7ae21ed91b710aa180
+ size 35874822
datatest/test.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ab5ced7fa96a8b65ad8077d69761f517b19a57d8ec74e86608101d3bb66c6a54
+ size 74199
n4a_analytics_lib/__init__.py ADDED
File without changes
n4a_analytics_lib/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (212 Bytes)
 
n4a_analytics_lib/__pycache__/analytics.cpython-38.pyc ADDED
Binary file (3.2 kB)
 
n4a_analytics_lib/__pycache__/metrics_utils.cpython-38.pyc ADDED
Binary file (2.2 kB)
 
n4a_analytics_lib/__pycache__/project.cpython-38.pyc ADDED
Binary file (2.32 kB)
 
n4a_analytics_lib/__pycache__/st_components.cpython-38.pyc ADDED
Binary file (807 Bytes)
 
n4a_analytics_lib/analytics.py ADDED
@@ -0,0 +1,75 @@
+ # -*- coding:utf-8 -*-
+ 
+ import pandas as pd
+ import seaborn as sns
+ import matplotlib
+ 
+ matplotlib.use('Agg')
+ 
+ import nltk
+ 
+ nltk.download('punkt')
+ from nltk.tokenize import sent_tokenize, word_tokenize
+ 
+ from n4a_analytics_lib.project import Project
+ 
+ 
+ class GlobalStatistics(Project):
+     def __init__(self, zip_project):
+         super().__init__(zip_project=zip_project, type="global")
+ 
+         self.data = [(src_file, ne_label) for src_file, ann in self.annotations.items() for ne_label in ann['labels']]
+         self.df_base = pd.DataFrame(self.data, columns=["SOURCE_FILE", "LABEL"])
+         self.df_i = self.df_base.groupby(["LABEL"])["LABEL"].count().reset_index(name="TOTAL")
+         self.df_details = self.df_base.groupby(["SOURCE_FILE", "LABEL"])["LABEL"].count().reset_index(name="TOTAL")
+ 
+         self.total_annotations_project = self.df_i['TOTAL'].sum()
+ 
+     def create_plot(self, type_data):
+         # apply data filter
+         data_tab_filtered = self.df_details.loc[self.df_details['SOURCE_FILE'] == type_data]
+         # create a new plot
+         ax = sns.barplot(x='LABEL', y='TOTAL', data=data_tab_filtered)
+         # add title to plot
+         ax.figure.suptitle(type_data)
+         # add value labels to bars
+         for container in ax.containers:
+             ax.bar_label(container)
+         return ax.figure
+ 
+ 
+ class IaaStatistics(Project):
+     def __init__(self, zip_project, baseline_text):
+         super().__init__(zip_project=zip_project, type="iaa")
+         self.baseline_text = baseline_text.decode('utf-8')
+ 
+         self.mentions_per_coder = self.extract_refs(self.annotations, self.annotators, type="mentions")
+         self.labels_per_coder = self.extract_refs(self.annotations, self.annotators, type="labels")
+ 
+         # {coder: {mention: label, ...}, ...}
+         self.annotations_per_coders = {coder: dict(zip(ann[1]['mentions'], ann[1]['labels'])) for coder, ann in zip(self.annotators, self.annotations.items())}
+ 
+     @staticmethod
+     def extract_refs(annotations, annotators, type):
+         return {
+             coder: data for coder, ann in zip(
+                 annotators,
+                 annotations.items()
+             ) for ref, data in ann[1].items() if ref == type
+         }
+ 
+     def analyze_text(self):
+         """Returns the total sentences, words and characters
+         of the baseline text, as a list.
+         """
+         return [
+             len(sent_tokenize(self.baseline_text, language="french")),
+             len(word_tokenize(self.baseline_text, language="french")),
+             len(self.baseline_text)
+         ]
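A minimal sketch of driving `GlobalStatistics` outside the Streamlit app, pointed at the sample export shipped in `datatest/` (since `extract_ne` is wrapped by the Streamlit progress-bar decorator, running this bare only triggers harmless bare-mode warnings):

```python
from n4a_analytics_lib.analytics import GlobalStatistics

# datatest/curation_rapid_global.zip is the sample INCEpTION export in this commit
stats = GlobalStatistics(zip_project="datatest/curation_rapid_global.zip")

print(stats.total_annotations_project)       # total curated named entities
print(stats.df_i)                            # per-label totals
fig = stats.create_plot(stats.documents[0])  # bar plot for the first document
fig.savefig("labels_first_document.png", dpi=150)
```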
n4a_analytics_lib/metrics_utils.py ADDED
@@ -0,0 +1,68 @@
+ # -*- coding:utf-8 -*-
+ 
+ import numpy as np
+ 
+ 
+ def fleiss_kappa_function(M):
+     """Computes Fleiss' kappa for a group of annotators.
+ 
+     :param M: a matrix of shape (:attr:`N`, :attr:`k`), where `N` is the number of subjects
+         and `k` the number of categories; `M[i, j]` is the number of raters who assigned
+         the `i`-th subject to the `j`-th category.
+     :type M: numpy matrix
+     :rtype: float
+     :return: Fleiss' kappa score
+     """
+     N, k = M.shape  # N = number of items, k = number of categories
+     n_annotators = float(np.sum(M[0, :]))  # number of annotators
+     tot_annotations = N * n_annotators  # total number of annotations
+     category_sum = np.sum(M, axis=0)  # sum of each category over all items
+ 
+     # chance agreement
+     p = category_sum / tot_annotations  # distribution of each category over all annotations
+     PbarE = np.sum(p * p)  # average chance agreement over all categories
+ 
+     # observed agreement
+     P = (np.sum(M * M, axis=1) - n_annotators) / (n_annotators * (n_annotators - 1))
+     Pbar = np.sum(P) / N  # sum the observed agreement per item and divide by the number of items
+ 
+     return round((Pbar - PbarE) / (1 - PbarE), 4)
+ 
+ 
+ def cohen_kappa_function(ann1, ann2):
+     """Computes Cohen's kappa for a pair of annotators.
+ 
+     :param ann1: annotations provided by the first annotator
+     :type ann1: list
+     :param ann2: annotations provided by the second annotator
+     :type ann2: list
+     :rtype: float
+     :return: Cohen's kappa statistic
+     """
+     count = 0
+     for an1, an2 in zip(ann1, ann2):
+         if an1 == an2:
+             count += 1
+     A = count / len(ann1)  # observed agreement A (Po)
+ 
+     uniq = set(ann1 + ann2)
+     E = 0  # expected agreement E (Pe)
+     for item in uniq:
+         cnt1 = ann1.count(item)
+         cnt2 = ann2.count(item)
+         E += (cnt1 / len(ann1)) * (cnt2 / len(ann2))
+ 
+     return round((A - E) / (1 - E), 4)
+ 
+ 
+ def interpret_kappa(score):
+     # boundaries follow the legend table; the intervals are contiguous so that
+     # scores such as 0.205 no longer fall through the gaps
+     if score < 0:
+         color = "#e74c3c;"
+     elif score <= 0.20:
+         color = "#f39c12;"
+     elif score <= 0.40:
+         color = "#f4d03f;"
+     elif score <= 0.60:
+         color = "#5dade2;"
+     elif score <= 0.80:
+         color = "#58d68d;"
+     else:
+         color = "#28b463;"
+ 
+     return f"<span style='font-size:30px; color: {color}'>{round(score * 100, 2)} %</span>"
n4a_analytics_lib/project.py ADDED
@@ -0,0 +1,102 @@
+ # -*- coding:utf-8 -*-
+ 
+ from io import BytesIO
+ import re
+ from zipfile import ZipFile
+ import os
+ from pathlib import Path
+ 
+ from cassis import load_typesystem, load_cas_from_xmi
+ 
+ from n4a_analytics_lib.st_components import st_pb
+ 
+ 
+ class Project:
+     def __init__(self, zip_project, type):
+         # zip container that holds the XMI files and the typesystem
+         self.zip_project = zip_project
+         # 'iaa' or 'global'
+         self.type = type
+ 
+         # store source filenames
+         self.documents = []
+         # store XMI representations
+         self.xmi_documents = []
+         # store typesystem file
+         self.typesystem = None
+ 
+         # set annotators
+         self.annotators = []
+         # annotations, laid out as:
+         # {
+         #     "Filename.xmi": {
+         #         "mentions": [],
+         #         "labels": []
+         #     }, ...
+         # }
+         self.annotations = {}
+ 
+         with ZipFile(self.zip_project) as project_zip:
+             if self.type == "global":
+                 regex = re.compile(r'.*curation/.*/(?!\._).*zip$')
+             elif self.type == "iaa":
+                 regex = re.compile(r'.*xm[il]$')
+ 
+             annotation_fps = (fp for fp in project_zip.namelist() if regex.match(fp))
+             for fp in annotation_fps:
+                 if self.type == "global":
+                     with ZipFile(BytesIO(project_zip.read(fp))) as annotation_zip:
+                         if self.typesystem is None:
+                             self.typesystem = load_typesystem(BytesIO(annotation_zip.read('TypeSystem.xml')))
+                         for f in annotation_zip.namelist():
+                             if f.endswith('.xmi'):
+                                 # store source filename
+                                 self.documents.append(Path(fp).parent.name)
+                                 # store XMI representation
+                                 self.xmi_documents.append(str(annotation_zip.read(f).decode("utf-8")))
+                 elif self.type == "iaa":
+                     if self.typesystem is None and fp.endswith('.xml'):
+                         self.typesystem = load_typesystem(BytesIO(project_zip.read('TypeSystem.xml')))
+                     elif fp.endswith('.xmi'):
+                         # store source filename
+                         self.documents.append(fp)
+                         # set annotators
+                         self.annotators.append(os.path.splitext(fp)[0])
+                         # store XMI representation
+                         self.xmi_documents.append(str(project_zip.read(fp).decode("utf-8")))
+ 
+         self.extract_ne()
+ 
+     @st_pb
+     def extract_ne(self):
+         count = 0
+         for xmi, src in zip(self.xmi_documents, self.documents):
+             doc_flag = True
+             try:
+                 cas = load_cas_from_xmi(xmi, typesystem=self.typesystem)
+                 self.annotations[src] = {
+                     "mentions": [],
+                     "labels": []
+                 }
+                 for ne in cas.select('de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity'):
+                     self.annotations[src]["mentions"].append(ne.get_covered_text())
+                     self.annotations[src]["labels"].append(ne.value)
+             except Exception:
+                 doc_flag = False
+ 
+             count += 1
+             yield count / len(self.documents), src, doc_flag
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding:utf-8 -*-
2
+
3
+ import streamlit as st
4
+
5
+
6
+ def st_pb(method):
7
+ def progress_bar(ref):
8
+ container = st.empty()
9
+ bar = st.progress(0)
10
+ pg_gen = method(ref)
11
+ try:
12
+ while True:
13
+ progress = next(pg_gen)
14
+ bar.progress(progress[0])
15
+ if progress[2]:
16
+ container.write("✅ Processing... " + progress[1])
17
+ else:
18
+ container.write("❌️ Errror with..." + progress[1])
19
+ except StopIteration as result:
20
+ return result.value
21
+
22
+ return progress_bar
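`st_pb` turns a generator method into a live progress bar: the wrapped generator yields `(fraction, label, ok)` tuples, and its `return` value is recovered from `StopIteration`. A minimal sketch of that contract (the `Demo` class is illustrative, not part of this commit):

```python
import streamlit as st

from n4a_analytics_lib.st_components import st_pb


class Demo:
    @st_pb
    def process(self):
        files = ["a.xmi", "b.xmi", "c.xmi"]
        for i, name in enumerate(files, start=1):
            # (fraction done, current label, success flag)
            yield i / len(files), name, True
        return len(files)


# inside a script run with `streamlit run`, this drives the bar to 100%
processed = Demo().process()
```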
requirements.txt ADDED
@@ -0,0 +1,139 @@
+ altair==4.2.0
+ argon2-cffi==21.3.0
+ argon2-cffi-bindings==21.2.0
+ asttokens==2.0.7
+ attrs==21.2.0
+ backcall==0.2.0
+ backports.zoneinfo==0.2.1
+ beautifulsoup4==4.11.1
+ bleach==5.0.1
+ blinker==1.5
+ cachetools==5.2.0
+ certifi==2022.6.15
+ cffi==1.15.1
+ charset-normalizer==2.1.0
+ click==8.1.3
+ colorama==0.4.5
+ commonmark==0.9.1
+ cvxopt==1.2.7
+ cvxpy==1.2.1
+ cycler==0.11.0
+ debugpy==1.6.2
+ decorator==5.1.1
+ defusedxml==0.7.1
+ deprecation==2.1.0
+ dkpro-cassis==0.7.2
+ ecos==2.0.10
+ entrypoints==0.4
+ executing==0.9.1
+ fastjsonschema==2.16.1
+ fonttools==4.37.1
+ gitdb==4.0.9
+ GitPython==3.1.27
+ idna==3.3
+ importlib-metadata==4.12.0
+ importlib-resources==5.4.0
+ inceptalytics==0.1.0
+ ipykernel==6.15.1
+ ipython==8.4.0
+ ipython-genutils==0.2.0
+ ipywidgets==7.7.1
+ jedi==0.18.1
+ Jinja2==3.1.2
+ joblib==1.1.0
+ jsonschema==4.9.1
+ jupyter-client==7.3.4
+ jupyter-core==4.11.1
+ jupyterlab-pygments==0.2.2
+ jupyterlab-widgets==1.1.1
+ kiwisolver==1.4.4
+ krippendorff==0.5.1
+ llvmlite==0.39.1
+ lxml==4.9.1
+ MarkupSafe==2.1.1
+ matplotlib==3.5.3
+ matplotlib-inline==0.1.3
+ mistune==0.8.4
+ more-itertools==8.12.0
+ nbclient==0.6.6
+ nbconvert==6.5.3
+ nbformat==5.4.0
+ nest-asyncio==1.5.5
+ nltk==3.7
+ notebook==6.4.12
+ numba==0.56.2
+ numpy==1.20.0
+ osqp==0.6.2.post5
+ packaging==21.3
+ pandas==1.4.3
+ pandocfilters==1.5.0
+ parso==0.8.3
+ pexpect==4.8.0
+ pickleshare==0.7.5
+ Pillow==9.2.0
+ pkgutil_resolve_name==1.3.10
+ plotly==5.10.0
+ prometheus-client==0.14.1
+ prompt-toolkit==3.0.30
+ protobuf==3.20.1
+ psutil==5.9.1
+ ptyprocess==0.7.0
+ pure-eval==0.2.2
+ pyannote.core==4.5
+ pyannote.database==4.1.3
+ pyarrow==9.0.0
+ pycaprio==0.2.1
+ pycparser==2.21
+ pydeck==0.7.1
+ pygamma-agreement==0.5.6
+ Pygments==2.12.0
+ pympi-ling==1.70.2
+ Pympler==1.0.1
+ pyparsing==3.0.9
+ pyrsistent==0.18.1
+ python-dateutil==2.8.2
+ pytz==2022.1
+ pytz-deprecation-shim==0.1.0.post0
+ PyYAML==6.0
+ pyzmq==23.2.0
+ qdldl==0.1.5.post2
+ regex==2022.7.25
+ requests==2.28.1
+ requests-toolbelt==0.9.1
+ rich==12.5.1
+ scikit-learn==1.1.2
+ scipy==1.9.1
+ scs==3.2.0
+ seaborn==0.11.2
+ semver==2.13.0
+ Send2Trash==1.8.0
+ shellingham==1.5.0
+ simplejson==3.17.6
+ six==1.16.0
+ smmap==5.0.0
+ sortedcontainers==2.4.0
+ soupsieve==2.3.2.post1
+ stack-data==0.3.0
+ streamlit==1.12.2
+ tenacity==8.0.1
+ terminado==0.15.0
+ TextGrid==1.5
+ threadpoolctl==3.1.0
+ tinycss2==1.1.1
+ toml==0.10.2
+ toolz==0.12.0
+ toposort==1.7
+ tornado==6.2
+ tqdm==4.64.0
+ traitlets==5.3.0
+ typer==0.6.1
+ typing_extensions==4.3.0
+ tzdata==2022.1
+ tzlocal==4.2
+ urllib3==1.26.11
+ validators==0.20.0
+ watchdog==2.1.9
+ wcwidth==0.2.5
+ webencodings==0.5.1
+ widgetsnbextension==3.6.1
+ zipp==3.8.1