lterriel committed
Commit
519b419
1 Parent(s): 73f630f
app.py ADDED
@@ -0,0 +1,325 @@
+ #!/usr/bin/env python3
+ # -*- coding:utf-8 -*-
+ 
+ from collections import defaultdict, Counter
+ from itertools import combinations
+ 
+ import pandas as pd
+ import seaborn as sn
+ import matplotlib.pyplot as plt
+ import streamlit as st
+ 
+ from n4a_analytics_lib.analytics import GlobalStatistics, IaaStatistics
+ from n4a_analytics_lib.metrics_utils import interpret_kappa, fleiss_kappa_function, cohen_kappa_function
+ 
+ TITLE = "NER4ARCHIVES Analytics"
+ 
+ # Set application
+ st.set_page_config(layout="wide")
+ 
+ # sidebar: meta, inputs etc.
+ sidebar = st.sidebar
+ # cols: display results
+ col1, col2 = st.columns(2)
+ 
+ # description
+ sidebar.markdown(f"""
+ # 📏 {TITLE}
+ 
+ A basic web application that displays a dashboard for analyzing an
+ INCEpTION annotation project, built in the context of NER4Archives
+ (Inria / Archives nationales).
+ 
+ This tool provides two statistics levels:
+ - *Global project statistics*: analyze named entities across all curated documents in the project;
+ - *Inter-Annotator Agreement results*: analyze the results of an IAA experiment.
+ """)
+ 
+ # Level to analyze
+ option = sidebar.selectbox('Which statistics level?', ('Inter-Annotator Agreement results', 'Global project statistics'))
+ 
+ # IAA results view
+ if option == "Inter-Annotator Agreement results":
+     annotations = sidebar.file_uploader("Upload IAA annotations (.zip format only): ")
+     baseline_text = sidebar.file_uploader("Upload baseline text (.txt format only): ")
+ 
+     if baseline_text is not None and annotations is not None:
+         project_analyzed = IaaStatistics(zip_project=annotations, baseline_text=baseline_text.getvalue())
+         baseline_analyzer = project_analyzed.analyze_text()
+ 
+         col2.markdown(f"""
+         ### BASELINE TEXT: {baseline_text.name}
+ 
+         - sentences: {baseline_analyzer[0]}
+         - words: {baseline_analyzer[1]}
+         - characters: {baseline_analyzer[2]}
+         """)
+ 
+         # mentions seen by any coder, deduplicated while preserving order
+         commune_mentions = [l for i, j in project_analyzed.mentions_per_coder.items() for l in j]
+         commune_mentions = list(dict.fromkeys(commune_mentions))
+ 
+         # count annotations per label for each coder
+         dicts_coders = []
+         for coder, coder_annotations in project_analyzed.annotations_per_coders.items():
+             dict_coder = dict(Counter(coder_annotations.values()))
+             dicts_coders.append(dict_coder)
+ 
+         # union of the labels used by all coders (not only the first one)
+         labels = sorted({label for dict_coder in dicts_coders for label in dict_coder})
+ 
+         df = pd.DataFrame(project_analyzed.annotations_per_coders, index=commune_mentions)
+         for ann in project_analyzed.annotators:
+             df[ann] = 'None'
+             for mention, value in project_analyzed.annotations_per_coders[ann].items():
+                 df.loc[mention, ann] = value
+ 
+         total_annotations = len(df)
+ 
+         df_n = df.apply(pd.Series.value_counts, axis=1).fillna(0).astype(int)
+         matrix = df_n.values
+ 
+         pairs = list(combinations(project_analyzed.annotations_per_coders, 2))
+ 
+         # Display in app
+         cont_kappa = st.container()
+         cont_kappa.title("Inter-Annotator Agreement (IAA) results")
+         tab1, tab2, tab3, tab4, tab5 = cont_kappa.tabs(
+             ["📈 IAA metrics", "🗃 IAA Metrics Legend", "✔️ Agree annotations", "❌ Disagree annotations",
+              "🏷️ Global Labels Statistics"])
+         tab1.subheader("Fleiss Kappa (global score for group):")
+         tab1.markdown(interpret_kappa(round(fleiss_kappa_function(matrix), 2)), unsafe_allow_html=True)
+         tab1.subheader("Cohen Kappa Annotators Matrix (score between annotators):")
+         data = []
+         for coder_1, coder_2 in pairs:
+             cohen_function = cohen_kappa_function(project_analyzed.labels_per_coder[coder_1], project_analyzed.labels_per_coder[coder_2])
+             data.append(((coder_1, coder_2), cohen_function))
+             tab1.markdown(f"* {coder_1} <> {coder_2} : {interpret_kappa(cohen_function)}", unsafe_allow_html=True)
+ 
+         intermediary = defaultdict(Counter)
+         for (src, tgt), count in data:
+             intermediary[src][tgt] = count
+ 
+         letters = sorted({key for inner in intermediary.values() for key in inner} | set(intermediary.keys()))
+ 
+         confusion_matrix = [[intermediary[src][tgt] for tgt in letters] for src in letters]
+ 
+         df_cm = pd.DataFrame(confusion_matrix, letters, letters)
+         mask = df_cm.values == 0
+         sn.set(font_scale=0.7)  # label size
+         colors = ["#e74c3c", "#f39c12", "#f4d03f", "#5dade2", "#58d68d", "#28b463"]
+         # the default value (14) must stay within the slider bounds
+         width = tab1.slider("matrix width", 1, 20, 14)
+         height = tab1.slider("matrix height", 1, 20, 4)
+         fig, ax = plt.subplots(figsize=(width, height))
+         sn.heatmap(df_cm, cmap=colors, annot=True, mask=mask, annot_kws={"size": 7}, vmin=0, vmax=1, ax=ax)
+         tab1.pyplot(ax.figure)
+         tab2.markdown("""
+         <table>
+           <thead>
+             <tr><th colspan="2">Kappa interpretation legend</th></tr>
+           </thead>
+           <tbody>
+             <tr><td>Kappa score (k)</td><td>Agreement</td></tr>
+             <tr style="background-color: #e74c3c;"><td>k &lt; 0</td><td>Less than chance agreement</td></tr>
+             <tr style="background-color: #f39c12;"><td>0.01 &ndash; 0.20</td><td>Slight agreement</td></tr>
+             <tr style="background-color: #f4d03f;"><td>0.21 &ndash; 0.40</td><td>Fair agreement</td></tr>
+             <tr style="background-color: #5dade2;"><td>0.41 &ndash; 0.60</td><td>Moderate agreement</td></tr>
+             <tr style="background-color: #58d68d;"><td>0.61 &ndash; 0.80</td><td>Substantial agreement</td></tr>
+             <tr style="background-color: #28b463;"><td>0.81 &ndash; 1.00</td><td>Almost perfect agreement</td></tr>
+           </tbody>
+         </table>
+         """, unsafe_allow_html=True)
+ 
+         ## common helpers
+         @st.cache
+         def convert_df(df_ex):
+             return df_ex.to_csv(encoding="utf-8").encode('utf-8')
+ 
+         ## Agree part
+         columns_to_compare = project_analyzed.annotators
+ 
+         def check_all_equal(iterator):
+             return len(set(iterator)) <= 1
+ 
+         df_agree = df[df[columns_to_compare].apply(check_all_equal, axis=1)]
+         total_unanime = len(df_agree)
+         csv_agree = convert_df(df_agree)
+ 
+         tab3.subheader("Total agree annotations:")
+         tab3.markdown(f"{total_unanime} / {len(df)} annotations ({round((total_unanime / len(df)) * 100, 2)} %)")
+         tab3.download_button(
+             "Press to Download CSV",
+             csv_agree,
+             "csv_annotators_agree.csv",
+             "text/csv",
+             key='download-csv-1'
+         )
+         tab3.dataframe(df_agree)
+ 
+         ## Disagree part
+         def check_all_not_equal(iterator):
+             return len(set(iterator)) > 1
+ 
+         df_disagree = df[df[columns_to_compare].apply(check_all_not_equal, axis=1)]
+         total_desaccord = len(df_disagree)
+         csv_disagree = convert_df(df_disagree)
+         tab4.subheader("Total disagree annotations:")
+         tab4.markdown(
+             f"{total_desaccord} / {len(df)} annotations ({round((total_desaccord / len(df)) * 100, 2)} %)")
+         tab4.download_button(
+             "Press to Download CSV",
+             csv_disagree,
+             "csv_annotators_disagree.csv",
+             "text/csv",
+             key='download-csv-2'
+         )
+         tab4.dataframe(df_disagree)
+ 
+         ## label alignment chart
+         def count_total_annotations_label(dataframe, labels):
+             pairs = []
+             for label in labels:
+                 total = dataframe.astype(object).eq(label).any(axis=1).sum()
+                 pairs.append((label, total))
+             return pairs
+ 
+         totals_annotations_per_labels = count_total_annotations_label(df, labels)
+ 
+         # Count, per label, how many mentions received the same class from all annotators
+         def total_agree_disagree_per_label(dataframe, pairs_totals_labels):
+             new_pairs = []
+             for label, total_rows_with_label in pairs_totals_labels:
+                 agree_res = dataframe[dataframe.nunique(axis=1).eq(1)].eq(label).any(axis=1).sum()
+                 disagree_res = total_rows_with_label - agree_res
+                 agree_percent = (agree_res / total_rows_with_label) * 100
+                 disagree_percent = (disagree_res / total_rows_with_label) * 100
+                 new_pairs.append((label, total_rows_with_label, agree_percent, disagree_percent))
+             return new_pairs
+ 
+         to_pie = total_agree_disagree_per_label(df, totals_annotations_per_labels)
+ 
+         def plot_pies(tasks_to_pie):
+             my_labels = 'agree', 'disagree'
+             my_colors = ['#47DBCD', '#F5B14C']
+             my_explode = (0, 0.1)
+             # squeeze=False keeps axes indexable even with a single label
+             fig, axes = plt.subplots(1, len(tasks_to_pie), figsize=(20, 3), squeeze=False)
+             for counter, t in enumerate(tasks_to_pie):
+                 tasks = [t[2], t[3]]
+                 axes[0][counter].pie(tasks, autopct='%1.1f%%', startangle=15, shadow=True, colors=my_colors,
+                                      explode=my_explode)
+                 axes[0][counter].set_title(t[0])
+                 axes[0][counter].axis('equal')
+             fig.set_facecolor("white")
+             fig.legend(labels=my_labels, loc="center right", borderaxespad=0.1, title="Labels alignment")
+             return fig
+ 
+         f = plot_pies(to_pie)
+         tab5.pyplot(f.figure)
+ 
+ # global project results view
+ 
+ # session-state reset used by the uploader's on_change callback
+ def clear_cache():
+     st.session_state["p_a"] = None
+ 
+ if option == "Global project statistics":
+     # guard: the key may not exist yet on the first run
+     if "p_a" not in st.session_state:
+         st.session_state["p_a"] = None
+     project = sidebar.file_uploader("Project folder that contains curated annotations in XMI 1.1 (.zip format only): ", on_change=clear_cache)
+     if project is not None:
+         if st.session_state["p_a"] is None:
+             st.session_state["p_a"] = GlobalStatistics(zip_project=project)
+         if st.session_state["p_a"] is not None:
+             with st.expander('Details on data'):
+                 col1.metric("Total curated annotations",
+                             f"{st.session_state['p_a'].total_annotations_project} Named entities")
+                 col1.dataframe(st.session_state['p_a'].df_i)
+                 selected_data = col1.selectbox('Select specific data to display bar plot:',
+                                                st.session_state['p_a'].documents)
+                 col2.pyplot(st.session_state['p_a'].create_plot(selected_data))
+
datatest/curation_rapid_global.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:804a01b2ffae53103cd67fa51671ccbbbc988cf2796ec40ccb20f1e9283c1b47
+ size 4670583
datatest/doc_baseline_IAA_4.txt ADDED
@@ -0,0 +1,53 @@
+ Les Abymes, Association pour l'utilisation du Rein Artificiel (AUDRA) (code démarche : 6173), 2010-2011 ; Centre hospitalier gérontologique du Raizet (code démarche : 6182), 2006-2010 ; Polyclinique de La Guadeloupe (code démarche : 5838), 2008-2011 Pointe-à-Pitre, Centre hospitalier universitaire (CHU) Pointe-à-Pitre - Abymes (code démarche : 5777), 2010.
+ 
+ 74-734, cliché n°390, matricule n°23 : vue d'ensemble du torrent de Saint-Martin[-Sallanches] depuis les bords de l'Arve (KUSS, 4 ex, 1890) 74-735, cliché n°391, matricule n°24 : vue d'ensemble du torrent de Saint-Martin[-Sallanches] depuis les bords de l'Arve (KUSS, 1 ex, 1890).
+ 
+ « Établissements Barbaud, boucherie hippophagique, fabrique de saucissons et salaisons » : projets de statuts constitué par Fernand Édouard Marius Barbaud, Alice Blanche Barbaud et Pierre Roger Robert Barbaud ; statuts définitifs ; état des clients débiteurs et inventaire au 3 mars 1938 ; autre inventaire au 31 décembre 1938 ; estimation d'une propriété au 148-150 rue Castagnary, 15e arrondissement, dépendant de la succession de François Barbaud ; notes, comptes, baux, plans, correspondance.
+ 
+ MC/DC/CXX/103 « La Gazette du Franc, revue hebdomadaire d'information et d'économie politique et financière » 1930-1931.
+ 
+ 1PH/413 Neufchef à Rimling.
+ 
+ 1PH/414 Rohrbach-lès-Bitche à Saint-Louis.
+ 
+ 1PH/415 Saint-Louis-lès-Bitche à Zoufftgen.
+ 
+ 20100562/512 Reportages 4305 Obsèques du pape Jean Paul II au Vatican 08/04/2005.
+ 
+ 20100562/513 Reportages 4306 Entretien avec DRISS JETTOU Premier ministre du MAROC 09/04/2005.
+ 
+ PA_324 Documents concernant Henri Lambert (1897-1958), appelé en janvier 1916, affecté au 27e RI, déserteur le 5 mars 1916, emprisonné à Lons-le-Saunier, affecté ensuite au 29e RI, blessé et affecté au 106e RI 1919-2004.
+ 
+ Louis Veuillot (une lettre à Mgr de Salinis, relatant une entrevue avec Napoléon III, 1858, transmise au prince par le baron Durrien en 1902) ; Georges Vicaire ; colonel Villot.
+ 
+ CP/F/14/17531/19, pièce 128 "First Order Light House for Cape Henry, VA Plate 2" [Phare de premier ordre du Cap Henry, Virginie Planche 2] Dressé par Peter C HAINS, ingénieur secrétaire Mai 1879.
+ 
+ BB/29/1082 page 116 La Ferté-sous-Jouarre (Seine-et-Marne).
+ 
+ D'azur semé de fleurs de lys d'or, au lion passant du même.
+ 
+ BB/29/1082 page 117 Pont-Audemer (Eure).
+ 
+ N/III/Jura/12 « PLAN DES TERRES VAINES ET VAGUES SCITUEZ DERRIÈRE LES CAPUSINS DE LA VILLE DE DOLE » Jean-Pierre Baronnet, géomètre arpenteur royal à Besançon, 1775.
+ 
+ Pl d'une partie de la forêt de Chaux; lieux dits ; grand chemin conduisant à la Vieille Loye, chemin de la Grande Loye; baraques des Tuileries, broussailles Notice 1280.
+ 
+ MC/ET/VII/814 21 Règlement entre Désirée Anne Radigue, veuve de Bajamin dit Benjamin Bourdin, ayant demeuré route Stratégique, au coin de la rue Saint-Fargeau, propriétaire, demeurant 71, rue de Rochechouart, actuellement à Choisy-le-Roi (Val-de-Marne), aux héritiers de son mari, Claude Eustache Bourdin, Marie Claudine Bourdin, Pierre Louis Bourdin, les époux Chalbot 28 juin 1861.
+ 
+ RAPATEL, de Rennes, colonel d'artillerie en retraite Demande d'avancement pour son neveu, maréchal des logis 1855.
+ 
+ Projets de lois de réorganisation, 1945-1947 Effectifs de l'année 1946 Programmes annuel de tournée : année 1945 (missions effectuées pour le Ministère de la Justice) ; année 1949 (contrôle des services départementaux dépendant du Ministère de l'Intérieur) Méthodes de travail: exploitation des rapports des inspecteurs de l'IGA Ecole nationale d'Administration (ENA): projet de stage des élèves de l'ENA dans les préfectures 1946, organisation de l'examen de classement et résultats généraux du concours de l'année 1948 (promotion Croix de Lorraine) et de l'année 1949 (promotion Quarante-huit).
+ 
+ Z/1j/257 Estimation des réparations à faire en une maison, rue du Plâtre, acquise par François Mansart de Guy Arbaleste, vicomte de Melun, et de Marie de Montmorency, sa femme 11 mars 1639.
+ 
+ Z/1j/258 Estimation des réparations à faire en une maison, rue du Plâtre, appartenant à Richard Le Gras et louée à Guillaume Pierre 8 août 1639.
+ 
+ TAILHADE, Laurent Encouragements, indemnités et secours, 12 juillet 1918 - 13 mai 1925 F/21/4154.
+ 
+ TALBOT-READY, 5 août 1887 - 8 mai 1908 F/21/4455.
+ 
+ TANO, Nicolas, 28 mars 1890 - 19 novembre 1937 F/21/4455.
+ 
+ 1 Tourtour, Var, arr de Draguignan, cant de Salernes.
+ 
+ 106 Blois Confirmation des privilèges des doyen et chapitre de l'église Saint-Vulfran d'Abbeville (Fol 18, n° 106) Novembre 1501.
datatest/ner4archives-template_curated_documents_2022-09-05_1345.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:904667ecd2393a040a53d1b99137d123b1e98591cff71b7ae21ed91b710aa180
+ size 35874822
datatest/test.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ab5ced7fa96a8b65ad8077d69761f517b19a57d8ec74e86608101d3bb66c6a54
+ size 74199
n4a_analytics_lib/__init__.py ADDED
File without changes
n4a_analytics_lib/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (212 Bytes)
 
n4a_analytics_lib/__pycache__/analytics.cpython-38.pyc ADDED
Binary file (3.2 kB)
 
n4a_analytics_lib/__pycache__/metrics_utils.cpython-38.pyc ADDED
Binary file (2.2 kB)
 
n4a_analytics_lib/__pycache__/project.cpython-38.pyc ADDED
Binary file (2.32 kB)
 
n4a_analytics_lib/__pycache__/st_components.cpython-38.pyc ADDED
Binary file (807 Bytes)
 
n4a_analytics_lib/analytics.py ADDED
@@ -0,0 +1,75 @@
+ # -*- coding:utf-8 -*-
+ 
+ import pandas as pd
+ import seaborn as sns
+ import matplotlib
+ 
+ matplotlib.use('Agg')
+ 
+ import nltk
+ 
+ nltk.download('punkt')
+ from nltk.tokenize import sent_tokenize, word_tokenize
+ 
+ from n4a_analytics_lib.project import Project
+ 
+ 
+ class GlobalStatistics(Project):
+     def __init__(self, zip_project):
+         super().__init__(zip_project=zip_project, type="global")
+ 
+         self.data = [(src_file, ne_label) for src_file, ann in self.annotations.items() for ne_label in ann['labels']]
+         self.df_base = pd.DataFrame(self.data, columns=["SOURCE_FILE", "LABEL"])
+         self.df_i = self.df_base.groupby(["LABEL"])["LABEL"].count().reset_index(name="TOTAL")
+         self.df_details = self.df_base.groupby(["SOURCE_FILE", "LABEL"])["LABEL"].count().reset_index(name="TOTAL")
+ 
+         self.total_annotations_project = self.df_i['TOTAL'].sum()
+ 
+     def create_plot(self, type_data):
+         # apply data filter
+         data_tab_filtered = self.df_details.loc[self.df_details['SOURCE_FILE'] == type_data]
+         # create a new plot
+         ax = sns.barplot(x='LABEL', y='TOTAL', data=data_tab_filtered)
+         # add title to plot
+         ax.figure.suptitle(type_data)
+         # add value labels to bars
+         for container in ax.containers:
+             ax.bar_label(container)
+         return ax.figure
+ 
+ 
+ class IaaStatistics(Project):
+     def __init__(self, zip_project, baseline_text):
+         super().__init__(zip_project=zip_project, type="iaa")
+         self.baseline_text = baseline_text.decode('utf-8')
+ 
+         self.mentions_per_coder = self.extract_refs(self.annotations, self.annotators, type="mentions")
+         self.labels_per_coder = self.extract_refs(self.annotations, self.annotators, type="labels")
+ 
+         # {coder: {mention: label, ...}, ...}
+         self.annotations_per_coders = {coder: dict(zip(ann[1]['mentions'], ann[1]['labels'])) for coder, ann in zip(self.annotators, self.annotations.items())}
+ 
+     @staticmethod
+     def extract_refs(annotations, annotators, type):
+         return {
+             coder: data for coder, ann in zip(
+                 annotators,
+                 annotations.items()
+             ) for ref, data in ann[1].items() if ref == type
+         }
+ 
+     def analyze_text(self):
+         """Returns the total sentences, words and characters
+         of the baseline text, as a list.
+         """
+         return [
+             len(sent_tokenize(self.baseline_text, language="french")),
+             len(word_tokenize(self.baseline_text, language="french")),
+             len(self.baseline_text)
+         ]
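A minimal sketch of driving `GlobalStatistics` outside the Streamlit app, pointed at the sample export shipped in `datatest/` (since `extract_ne` is wrapped by the Streamlit progress-bar decorator, running this bare only triggers harmless bare-mode warnings):

```python
from n4a_analytics_lib.analytics import GlobalStatistics

# datatest/curation_rapid_global.zip is the sample INCEpTION export in this commit
stats = GlobalStatistics(zip_project="datatest/curation_rapid_global.zip")

print(stats.total_annotations_project)       # total curated named entities
print(stats.df_i)                            # per-label totals
fig = stats.create_plot(stats.documents[0])  # bar plot for the first document
fig.savefig("labels_first_document.png", dpi=150)
```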
n4a_analytics_lib/metrics_utils.py ADDED
@@ -0,0 +1,68 @@
+ # -*- coding:utf-8 -*-
+ 
+ import numpy as np
+ 
+ 
+ def fleiss_kappa_function(M):
+     """Computes Fleiss' kappa for a group of annotators.
+ 
+     :param M: a matrix of shape (:attr:`N`, :attr:`k`), where `N` is the number of subjects
+         and `k` the number of categories; `M[i, j]` is the number of raters who assigned
+         the `i`-th subject to the `j`-th category.
+     :type M: numpy matrix
+     :rtype: float
+     :return: Fleiss' kappa score
+     """
+     N, k = M.shape  # N = number of items, k = number of categories
+     n_annotators = float(np.sum(M[0, :]))  # number of annotators
+     tot_annotations = N * n_annotators  # total number of annotations
+     category_sum = np.sum(M, axis=0)  # sum of each category over all items
+ 
+     # chance agreement
+     p = category_sum / tot_annotations  # distribution of each category over all annotations
+     PbarE = np.sum(p * p)  # average chance agreement over all categories
+ 
+     # observed agreement
+     P = (np.sum(M * M, axis=1) - n_annotators) / (n_annotators * (n_annotators - 1))
+     Pbar = np.sum(P) / N  # sum the observed agreement per item and divide by the number of items
+ 
+     return round((Pbar - PbarE) / (1 - PbarE), 4)
+ 
+ 
+ def cohen_kappa_function(ann1, ann2):
+     """Computes Cohen's kappa for a pair of annotators.
+ 
+     :param ann1: annotations provided by the first annotator
+     :type ann1: list
+     :param ann2: annotations provided by the second annotator
+     :type ann2: list
+     :rtype: float
+     :return: Cohen's kappa statistic
+     """
+     count = 0
+     for an1, an2 in zip(ann1, ann2):
+         if an1 == an2:
+             count += 1
+     A = count / len(ann1)  # observed agreement A (Po)
+ 
+     uniq = set(ann1 + ann2)
+     E = 0  # expected agreement E (Pe)
+     for item in uniq:
+         cnt1 = ann1.count(item)
+         cnt2 = ann2.count(item)
+         E += (cnt1 / len(ann1)) * (cnt2 / len(ann2))
+ 
+     return round((A - E) / (1 - E), 4)
+ 
+ 
+ def interpret_kappa(score):
+     # boundaries follow the legend table; the intervals are contiguous so that
+     # scores such as 0.205 no longer fall through the gaps
+     if score < 0:
+         color = "#e74c3c;"
+     elif score <= 0.20:
+         color = "#f39c12;"
+     elif score <= 0.40:
+         color = "#f4d03f;"
+     elif score <= 0.60:
+         color = "#5dade2;"
+     elif score <= 0.80:
+         color = "#58d68d;"
+     else:
+         color = "#28b463;"
+ 
+     return f"<span style='font-size:30px; color: {color}'>{round(score * 100, 2)} %</span>"
n4a_analytics_lib/project.py ADDED
@@ -0,0 +1,102 @@
+ # -*- coding:utf-8 -*-
+ 
+ from io import BytesIO
+ import re
+ from zipfile import ZipFile
+ import os
+ from pathlib import Path
+ 
+ from cassis import load_typesystem, load_cas_from_xmi
+ 
+ from n4a_analytics_lib.st_components import st_pb
+ 
+ 
+ class Project:
+     def __init__(self, zip_project, type):
+         # zip container that holds the XMI files and the typesystem
+         self.zip_project = zip_project
+         # 'iaa' or 'global'
+         self.type = type
+ 
+         # store source filenames
+         self.documents = []
+         # store XMI representations
+         self.xmi_documents = []
+         # store typesystem file
+         self.typesystem = None
+ 
+         # set annotators
+         self.annotators = []
+         # annotations, laid out as:
+         # {
+         #     "Filename.xmi": {
+         #         "mentions": [],
+         #         "labels": []
+         #     }, ...
+         # }
+         self.annotations = {}
+ 
+         with ZipFile(self.zip_project) as project_zip:
+             if self.type == "global":
+                 regex = re.compile(r'.*curation/.*/(?!\._).*zip$')
+             elif self.type == "iaa":
+                 regex = re.compile(r'.*xm[il]$')
+ 
+             annotation_fps = (fp for fp in project_zip.namelist() if regex.match(fp))
+             for fp in annotation_fps:
+                 if self.type == "global":
+                     with ZipFile(BytesIO(project_zip.read(fp))) as annotation_zip:
+                         if self.typesystem is None:
+                             self.typesystem = load_typesystem(BytesIO(annotation_zip.read('TypeSystem.xml')))
+                         for f in annotation_zip.namelist():
+                             if f.endswith('.xmi'):
+                                 # store source filename
+                                 self.documents.append(Path(fp).parent.name)
+                                 # store XMI representation
+                                 self.xmi_documents.append(str(annotation_zip.read(f).decode("utf-8")))
+                 elif self.type == "iaa":
+                     if self.typesystem is None and fp.endswith('.xml'):
+                         self.typesystem = load_typesystem(BytesIO(project_zip.read('TypeSystem.xml')))
+                     elif fp.endswith('.xmi'):
+                         # store source filename
+                         self.documents.append(fp)
+                         # set annotators
+                         self.annotators.append(os.path.splitext(fp)[0])
+                         # store XMI representation
+                         self.xmi_documents.append(str(project_zip.read(fp).decode("utf-8")))
+ 
+         self.extract_ne()
+ 
+     @st_pb
+     def extract_ne(self):
+         count = 0
+         for xmi, src in zip(self.xmi_documents, self.documents):
+             doc_flag = True
+             try:
+                 cas = load_cas_from_xmi(xmi, typesystem=self.typesystem)
+                 self.annotations[src] = {
+                     "mentions": [],
+                     "labels": []
+                 }
+                 for ne in cas.select('de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity'):
+                     self.annotations[src]["mentions"].append(ne.get_covered_text())
+                     self.annotations[src]["labels"].append(ne.value)
+             except Exception:
+                 doc_flag = False
+ 
+             count += 1
+             yield count / len(self.documents), src, doc_flag
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding:utf-8 -*-
2
+
3
+ import streamlit as st
4
+
5
+
6
+ def st_pb(method):
7
+ def progress_bar(ref):
8
+ container = st.empty()
9
+ bar = st.progress(0)
10
+ pg_gen = method(ref)
11
+ try:
12
+ while True:
13
+ progress = next(pg_gen)
14
+ bar.progress(progress[0])
15
+ if progress[2]:
16
+ container.write("✅ Processing... " + progress[1])
17
+ else:
18
+ container.write("❌️ Errror with..." + progress[1])
19
+ except StopIteration as result:
20
+ return result.value
21
+
22
+ return progress_bar
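`st_pb` turns a generator method into a live progress bar: the wrapped generator yields `(fraction, label, ok)` tuples, and its `return` value is recovered from `StopIteration`. A minimal sketch of that contract (the `Demo` class is illustrative, not part of this commit):

```python
import streamlit as st

from n4a_analytics_lib.st_components import st_pb


class Demo:
    @st_pb
    def process(self):
        files = ["a.xmi", "b.xmi", "c.xmi"]
        for i, name in enumerate(files, start=1):
            # (fraction done, current label, success flag)
            yield i / len(files), name, True
        return len(files)


# inside a script run with `streamlit run`, this drives the bar to 100%
processed = Demo().process()
```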
requirements.txt ADDED
@@ -0,0 +1,139 @@
+ altair==4.2.0
+ argon2-cffi==21.3.0
+ argon2-cffi-bindings==21.2.0
+ asttokens==2.0.7
+ attrs==21.2.0
+ backcall==0.2.0
+ backports.zoneinfo==0.2.1
+ beautifulsoup4==4.11.1
+ bleach==5.0.1
+ blinker==1.5
+ cachetools==5.2.0
+ certifi==2022.6.15
+ cffi==1.15.1
+ charset-normalizer==2.1.0
+ click==8.1.3
+ colorama==0.4.5
+ commonmark==0.9.1
+ cvxopt==1.2.7
+ cvxpy==1.2.1
+ cycler==0.11.0
+ debugpy==1.6.2
+ decorator==5.1.1
+ defusedxml==0.7.1
+ deprecation==2.1.0
+ dkpro-cassis==0.7.2
+ ecos==2.0.10
+ entrypoints==0.4
+ executing==0.9.1
+ fastjsonschema==2.16.1
+ fonttools==4.37.1
+ gitdb==4.0.9
+ GitPython==3.1.27
+ idna==3.3
+ importlib-metadata==4.12.0
+ importlib-resources==5.4.0
+ inceptalytics==0.1.0
+ ipykernel==6.15.1
+ ipython==8.4.0
+ ipython-genutils==0.2.0
+ ipywidgets==7.7.1
+ jedi==0.18.1
+ Jinja2==3.1.2
+ joblib==1.1.0
+ jsonschema==4.9.1
+ jupyter-client==7.3.4
+ jupyter-core==4.11.1
+ jupyterlab-pygments==0.2.2
+ jupyterlab-widgets==1.1.1
+ kiwisolver==1.4.4
+ krippendorff==0.5.1
+ llvmlite==0.39.1
+ lxml==4.9.1
+ MarkupSafe==2.1.1
+ matplotlib==3.5.3
+ matplotlib-inline==0.1.3
+ mistune==0.8.4
+ more-itertools==8.12.0
+ nbclient==0.6.6
+ nbconvert==6.5.3
+ nbformat==5.4.0
+ nest-asyncio==1.5.5
+ nltk==3.7
+ notebook==6.4.12
+ numba==0.56.2
+ numpy==1.20.0
+ osqp==0.6.2.post5
+ packaging==21.3
+ pandas==1.4.3
+ pandocfilters==1.5.0
+ parso==0.8.3
+ pexpect==4.8.0
+ pickleshare==0.7.5
+ Pillow==9.2.0
+ pkgutil_resolve_name==1.3.10
+ plotly==5.10.0
+ prometheus-client==0.14.1
+ prompt-toolkit==3.0.30
+ protobuf==3.20.1
+ psutil==5.9.1
+ ptyprocess==0.7.0
+ pure-eval==0.2.2
+ pyannote.core==4.5
+ pyannote.database==4.1.3
+ pyarrow==9.0.0
+ pycaprio==0.2.1
+ pycparser==2.21
+ pydeck==0.7.1
+ pygamma-agreement==0.5.6
+ Pygments==2.12.0
+ pympi-ling==1.70.2
+ Pympler==1.0.1
+ pyparsing==3.0.9
+ pyrsistent==0.18.1
+ python-dateutil==2.8.2
+ pytz==2022.1
+ pytz-deprecation-shim==0.1.0.post0
+ PyYAML==6.0
+ pyzmq==23.2.0
+ qdldl==0.1.5.post2
+ regex==2022.7.25
+ requests==2.28.1
+ requests-toolbelt==0.9.1
+ rich==12.5.1
+ scikit-learn==1.1.2
+ scipy==1.9.1
+ scs==3.2.0
+ seaborn==0.11.2
+ semver==2.13.0
+ Send2Trash==1.8.0
+ shellingham==1.5.0
+ simplejson==3.17.6
+ six==1.16.0
+ smmap==5.0.0
+ sortedcontainers==2.4.0
+ soupsieve==2.3.2.post1
+ stack-data==0.3.0
+ streamlit==1.12.2
+ tenacity==8.0.1
+ terminado==0.15.0
+ TextGrid==1.5
+ threadpoolctl==3.1.0
+ tinycss2==1.1.1
+ toml==0.10.2
+ toolz==0.12.0
+ toposort==1.7
+ tornado==6.2
+ tqdm==4.64.0
+ traitlets==5.3.0
+ typer==0.6.1
+ typing_extensions==4.3.0
+ tzdata==2022.1
+ tzlocal==4.2
+ urllib3==1.26.11
+ validators==0.20.0
+ watchdog==2.1.9
+ wcwidth==0.2.5
+ webencodings==0.5.1
+ widgetsnbextension==3.6.1
+ zipp==3.8.1