Spaces:
Sleeping
Sleeping
[feature] add a credentials form to retrieve data via the remote INCEpTION API
Browse files- .gitignore +3 -1
- app.py +153 -33
- n4a_analytics_lib/__pycache__/__init__.cpython-38.pyc +0 -0
- n4a_analytics_lib/__pycache__/analytics.cpython-38.pyc +0 -0
- n4a_analytics_lib/__pycache__/project.cpython-38.pyc +0 -0
- n4a_analytics_lib/__pycache__/st_components.cpython-38.pyc +0 -0
- n4a_analytics_lib/analytics.py +2 -3
- n4a_analytics_lib/constants.py +14 -0
- n4a_analytics_lib/project.py +45 -32
.gitignore
CHANGED
@@ -1,2 +1,4 @@
|
|
1 |
venv/
|
2 |
-
.idea/
|
|
|
|
|
|
1 |
venv/
|
2 |
+
.idea/
|
3 |
+
__pycache__/
|
4 |
+
LEGACY.py
|
app.py
CHANGED
@@ -1,45 +1,54 @@
|
|
1 |
#!/usr/bin/env python3
|
2 |
# -*- coding:utf-8 -*-
|
3 |
-
|
4 |
-
|
5 |
|
6 |
import streamlit as st
|
7 |
from streamlit.components.v1 import html
|
8 |
from n4a_analytics_lib.analytics import (GlobalStatistics, IaaStatistics)
|
|
|
9 |
|
10 |
|
11 |
-
TITLE = "NER4ARCHIVES Analytics"
|
12 |
-
|
13 |
# Set application
|
14 |
st.set_page_config(layout="wide")
|
15 |
-
|
16 |
# sidebar: meta, inputs etc.
|
17 |
sidebar = st.sidebar
|
18 |
# cols: display results
|
19 |
col1, col2 = st.columns(2)
|
20 |
|
21 |
# description
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
# Level to analyze
|
36 |
-
option = sidebar.selectbox('Which statistics level?', ('Inter-Annotator Agreement results',
|
|
|
37 |
|
38 |
# IAA results view
|
39 |
if option == "Inter-Annotator Agreement results":
|
40 |
annotations = sidebar.file_uploader("Upload IAA annotations (.zip format only): ")
|
41 |
baseline_text = sidebar.file_uploader("Upload baseline text (.txt format only): ")
|
42 |
|
|
|
43 |
if baseline_text is not None and annotations is not None:
|
44 |
project_analyzed = IaaStatistics(zip_project=annotations, baseline_text=baseline_text.getvalue())
|
45 |
baseline_analyzer = project_analyzed.analyze_text()
|
@@ -308,24 +317,135 @@ if option == "Inter-Annotator Agreement results":
|
|
308 |
st.pyplot(f.figure)
|
309 |
|
310 |
# global project results view
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
311 |
|
312 |
|
313 |
-
# to st components
|
314 |
-
def clear_cache():
|
315 |
-
st.session_state["p_a"] = None
|
316 |
|
317 |
if option == "Global project statistics":
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
|
329 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
330 |
|
331 |
|
|
|
1 |
#!/usr/bin/env python3
|
2 |
# -*- coding:utf-8 -*-
|
3 |
+
import requests.exceptions
|
4 |
+
import zipfile
|
5 |
|
6 |
import streamlit as st
|
7 |
from streamlit.components.v1 import html
|
8 |
from n4a_analytics_lib.analytics import (GlobalStatistics, IaaStatistics)
|
9 |
+
from n4a_analytics_lib.constants import (DESCRIPTION)
|
10 |
|
11 |
|
|
|
|
|
12 |
# Set application
|
13 |
st.set_page_config(layout="wide")
|
|
|
14 |
# sidebar: meta, inputs etc.
|
15 |
sidebar = st.sidebar
|
16 |
# cols: display results
|
17 |
col1, col2 = st.columns(2)
|
18 |
|
19 |
# description
|
20 |
+
sidebar.markdown(DESCRIPTION)
|
21 |
+
|
22 |
+
|
23 |
+
|
24 |
+
|
25 |
+
# to st components
|
26 |
+
#def clear_cache():
|
27 |
+
# st.session_state = {}
|
28 |
+
|
29 |
+
def check_login(username, password):
    """Tell whether both credential fields have been filled in.

    This is only a presence check on the form inputs; it does not
    authenticate anything against a server.

    :param username: value of the username field (anything supporting ``len``).
    :param password: value of the password field (anything supporting ``len``).
    :return: ``True`` when both values are non-empty, ``False`` otherwise.
    """
    # `and` short-circuits, so the password is only measured when the
    # username is non-empty.
    return len(username) != 0 and len(password) != 0
|
33 |
+
|
34 |
+
def logout():
    """Placeholder for a future logout action; currently does nothing."""
    return None
|
36 |
+
|
37 |
+
|
38 |
+
|
39 |
+
|
40 |
+
|
41 |
|
42 |
# Level to analyze
|
43 |
+
option = sidebar.selectbox('Which statistics level?', ('Inter-Annotator Agreement results',
|
44 |
+
'Global project statistics'))
|
45 |
|
46 |
# IAA results view
|
47 |
if option == "Inter-Annotator Agreement results":
|
48 |
annotations = sidebar.file_uploader("Upload IAA annotations (.zip format only): ")
|
49 |
baseline_text = sidebar.file_uploader("Upload baseline text (.txt format only): ")
|
50 |
|
51 |
+
|
52 |
if baseline_text is not None and annotations is not None:
|
53 |
project_analyzed = IaaStatistics(zip_project=annotations, baseline_text=baseline_text.getvalue())
|
54 |
baseline_analyzer = project_analyzed.analyze_text()
|
|
|
317 |
st.pyplot(f.figure)
|
318 |
|
319 |
# global project results view
|
320 |
+
# st_session = {"gs_local":True, "gs_remote":False, "gs_obj":<object>}
|
321 |
+
|
322 |
+
def display_data():
    """Render the global statistics stored in the session.

    Reads the statistics object saved under ``st.session_state['gs_obj']``
    and shows, in the left column, the total annotation count, the summary
    dataframe and a selector over the documents; the plot for the selected
    entry is drawn in the right column.
    """
    stats = st.session_state['gs_obj']

    # Left column: headline metric and the summary table.
    total_label = f"{stats.total_annotations_project} Named entities"
    col1.metric("Total curated annotations", total_label)
    col1.dataframe(stats.df_i)

    # The selector drives which plot is drawn in the right column.
    selected_data = col1.selectbox(
        'Select specific data to display bar plot:',
        stats.documents,
        key="selector_data",
    )
    figure = stats.create_plot(selected_data)
    col2.pyplot(figure)
|
329 |
+
|
330 |
+
def init_session_statistics(remote: bool, local: bool, data: tuple) -> None:
    """Reset the global-statistics session entries and build a new statistics object.

    :param remote: ``True`` to fetch curated documents from the INCEpTION API host.
    :param local: ``True`` to read curated documents from an uploaded archive.
    :param data: in remote mode, a ``(username, password)`` pair; in local
        mode, the uploaded zip object, which is passed unchanged to
        ``GlobalStatistics``.
    :return: ``None``. The result is stored in ``st.session_state`` under
        ``"gs_local"``, ``"gs_remote"`` and, on success, ``"gs_obj"``.
    """
    # Clear this view's entries in place. Rebinding ``st.session_state`` to a
    # plain dict would replace Streamlit's per-session proxy with an ordinary
    # module-level object, so the stored data would no longer be tied to the
    # current browser session.
    for stale_key in ("gs_local", "gs_remote", "gs_obj"):
        if stale_key in st.session_state:
            del st.session_state[stale_key]

    # Record which retrieval mode produced the current statistics.
    st.session_state["gs_local"] = local
    st.session_state["gs_remote"] = remote

    # Remote mode: the fetch helper stores "gs_obj" itself once it has data.
    if remote and not local:
        st.success('Fetch curated documents from host INCEpTION API in progress...')
        fetch_curated_data_from_remote(
            username=data[0],
            password=data[1]
        )

    # Local mode: build the statistics directly from the uploaded archive.
    if local and not remote:
        st.session_state["gs_obj"] = GlobalStatistics(zip_project=data, remote=False)
|
349 |
+
|
350 |
+
|
351 |
+
|
352 |
+
|
353 |
+
|
354 |
+
from pycaprio import Pycaprio, mappings
|
355 |
+
from zipfile import ZipFile
|
356 |
+
import io
|
357 |
+
import requests
|
358 |
+
|
359 |
+
def fetch_curated_data_from_remote(username: str,
                                   password: str,
                                   endpoint: str = "https://inception.dhlab.epfl.ch/prod",
                                   project_title: str = "ner4archives-template"):
    """Download the curated documents of a project and store their statistics.

    Every document whose state is ``CURATION_COMPLETE`` is exported in
    UIMA CAS XMI 1.1 format, the exports are merged into a single in-memory
    zip archive, and a ``GlobalStatistics`` object built from that archive is
    saved in ``st.session_state["gs_obj"]``.

    :param username: INCEpTION account name.
    :param password: INCEpTION account password.
    :param endpoint: base URL of the INCEpTION host.
    :param project_title: name of the project to read on that host.
    :return: ``None``. On failure (bad credentials, unknown project, nothing
        curated) a message is shown in the app and ``"gs_obj"`` is not set.
    """
    try:
        client = Pycaprio(inception_host=endpoint, authentication=(str(username), str(password)))
        # NOTE(review): the first request is kept inside the ``try`` on the
        # assumption that a rejected login may only surface here rather than
        # when the client is constructed -- confirm against pycaprio.
        matching_projects = [p for p in client.api.projects() if p.project_name == project_title]
    except requests.exceptions.JSONDecodeError:
        # username / password incorrect
        st.error('Username or Password is incorrect please retry.')
        # Stop here: without a working client nothing below can run.
        return

    if not matching_projects:
        st.error(f"Project '{project_title}' was not found on the INCEpTION host.")
        return
    project_id = matching_projects[0].project_id

    # Export only the documents whose curation is finished.
    curations = [
        client.api.curation(project_id, document,
                            curation_format=mappings.InceptionFormat.UIMA_CAS_XMI_XML_1_1)
        for document in client.api.documents(project_id)
        if document.document_state == mappings.DocumentState.CURATION_COMPLETE
    ]
    if not curations:
        st.warning("No curated document is available in this project yet.")
        return

    # Merge all exports into the first one. It is opened in append mode so it
    # can receive members; the others are only read and are closed as soon as
    # they have been copied.
    # Assumes each export is the bytes of a zip archive -- TODO confirm.
    with ZipFile(io.BytesIO(curations[0]), mode="a") as merged:
        known_members = set(merged.namelist())
        for payload in curations[1:]:
            with ZipFile(io.BytesIO(payload), mode="r") as extra:
                for member in extra.namelist():
                    # First occurrence wins; entries repeated across exports
                    # are copied only once.
                    if member not in known_members:
                        merged.writestr(member, extra.read(member))
                        known_members.add(member)

        # Build the statistics while the merged archive is still open, since
        # a closed archive can no longer be read.
        st.session_state["gs_obj"] = GlobalStatistics(zip_project=merged, remote=True)
|
405 |
+
|
406 |
|
407 |
|
|
|
|
|
|
|
408 |
|
409 |
# Global project statistics view: choose a data source, load it on demand,
# then display whatever statistics are stored in the session.
if option == "Global project statistics":
    # The radio labels are named once so the comparisons below can never
    # drift away from the options actually shown to the user.
    MODE_LOCAL = "Local directory"
    MODE_REMOTE = "INCEpTION API Host remote"

    # User input controllers
    mode = sidebar.radio("Choose mode to retrieve curated data: ", (
        MODE_LOCAL, MODE_REMOTE
    ))
    data = None
    if mode == MODE_LOCAL:
        project = sidebar.file_uploader("Folder that contains curated annotations in XMI 1.1 (.zip format only): ",
                                        type="zip")
        data = project
    if mode == MODE_REMOTE:
        username = sidebar.text_input("Username: ")
        password = sidebar.text_input("Password: ", type="password")
        data = (username, password)

    # Validate inputs
    btn_process = sidebar.button('Process', key='process')

    # Access data with local resources
    if btn_process and mode == MODE_LOCAL:
        if data is not None:
            # create a new session
            init_session_statistics(remote=False, local=True, data=data)

    # Access data with remote resources
    if btn_process and mode == MODE_REMOTE:
        if data is not None:
            if check_login(username=data[0], password=data[1]):
                # create a new session
                init_session_statistics(remote=True, local=False, data=data)
            else:
                st.error("Sorry! Username or Password is empty.")

    # Change data values and visualize new plot
    if "gs_obj" in st.session_state:
        if st.session_state["gs_local"] or st.session_state["gs_remote"]:
            display_data()
|
446 |
+
|
447 |
+
|
448 |
+
|
449 |
+
|
450 |
|
451 |
|
n4a_analytics_lib/__pycache__/__init__.cpython-38.pyc
CHANGED
Binary files a/n4a_analytics_lib/__pycache__/__init__.cpython-38.pyc and b/n4a_analytics_lib/__pycache__/__init__.cpython-38.pyc differ
|
|
n4a_analytics_lib/__pycache__/analytics.cpython-38.pyc
CHANGED
Binary files a/n4a_analytics_lib/__pycache__/analytics.cpython-38.pyc and b/n4a_analytics_lib/__pycache__/analytics.cpython-38.pyc differ
|
|
n4a_analytics_lib/__pycache__/project.cpython-38.pyc
CHANGED
Binary files a/n4a_analytics_lib/__pycache__/project.cpython-38.pyc and b/n4a_analytics_lib/__pycache__/project.cpython-38.pyc differ
|
|
n4a_analytics_lib/__pycache__/st_components.cpython-38.pyc
CHANGED
Binary files a/n4a_analytics_lib/__pycache__/st_components.cpython-38.pyc and b/n4a_analytics_lib/__pycache__/st_components.cpython-38.pyc differ
|
|
n4a_analytics_lib/analytics.py
CHANGED
@@ -15,9 +15,8 @@ from n4a_analytics_lib.project import Project
|
|
15 |
|
16 |
|
17 |
class GlobalStatistics(Project):
|
18 |
-
def __init__(self, zip_project):
|
19 |
-
super().__init__(zip_project=zip_project, type="global")
|
20 |
-
|
21 |
self.data = [(src_file, ne_label) for src_file, ann in self.annotations.items() for ne_label in ann['labels']]
|
22 |
self.df_base = pd.DataFrame(self.data, columns=["SOURCE_FILE", "LABEL"])
|
23 |
self.df_i = self.df_base.groupby(["LABEL"])["LABEL"].count().reset_index(name="TOTAL")
|
|
|
15 |
|
16 |
|
17 |
class GlobalStatistics(Project):
|
18 |
+
def __init__(self, zip_project, remote=False):
|
19 |
+
super().__init__(zip_project=zip_project, remote=remote, type="global")
|
|
|
20 |
self.data = [(src_file, ne_label) for src_file, ann in self.annotations.items() for ne_label in ann['labels']]
|
21 |
self.df_base = pd.DataFrame(self.data, columns=["SOURCE_FILE", "LABEL"])
|
22 |
self.df_i = self.df_base.groupby(["LABEL"])["LABEL"].count().reset_index(name="TOTAL")
|
n4a_analytics_lib/constants.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding:utf-8 -*-
|
2 |
+
|
3 |
+
TITLE = "NER4ARCHIVES Analytics"
|
4 |
+
DESCRIPTION = f"""
|
5 |
+
# π {TITLE}
|
6 |
+
|
7 |
+
A basic web application to display a dashboard for
|
8 |
+
analyzing INCEpTION annotation project built in context
|
9 |
+
of NER4Archives (Inria/Archives nationales).
|
10 |
+
|
11 |
+
- This tool provides two statistics levels:
|
12 |
+
- *Global project statistics*: Analyze named entities in overall curated documents in project;
|
13 |
+
- *Inter-Annotator Agreement results*: Analyze results of IAA experiment.
|
14 |
+
"""
|
n4a_analytics_lib/project.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
# -*- coding:utf-8 -*-
|
2 |
-
|
3 |
from io import BytesIO
|
4 |
import re
|
5 |
from zipfile import ZipFile
|
@@ -13,9 +13,12 @@ from n4a_analytics_lib.st_components import st_pb
|
|
13 |
|
14 |
|
15 |
class Project:
|
16 |
-
def __init__(self, zip_project, type):
|
17 |
# zip container that contains XMI and typesystem
|
18 |
self.zip_project = zip_project
|
|
|
|
|
|
|
19 |
# 'iaa' or 'global'
|
20 |
self.type = type
|
21 |
|
@@ -42,36 +45,46 @@ class Project:
|
|
42 |
self.annotations = {}
|
43 |
|
44 |
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
|
76 |
|
77 |
self.extract_ne()
|
|
|
1 |
# -*- coding:utf-8 -*-
|
2 |
+
import zipfile
|
3 |
from io import BytesIO
|
4 |
import re
|
5 |
from zipfile import ZipFile
|
|
|
13 |
|
14 |
|
15 |
class Project:
|
16 |
+
def __init__(self, zip_project, type, remote):
|
17 |
# zip container that contains XMI and typesystem
|
18 |
self.zip_project = zip_project
|
19 |
+
|
20 |
+
self.remote = remote
|
21 |
+
|
22 |
# 'iaa' or 'global'
|
23 |
self.type = type
|
24 |
|
|
|
45 |
self.annotations = {}
|
46 |
|
47 |
|
48 |
+
if isinstance(self.zip_project, zipfile.ZipFile) and self.remote and self.type == "global":
|
49 |
+
for fp in self.zip_project.namelist():
|
50 |
+
if self.typesystem is None:
|
51 |
+
self.typesystem = load_typesystem(BytesIO(self.zip_project.open('TypeSystem.xml').read()))
|
52 |
+
if fp.endswith('.xmi'):
|
53 |
+
self.documents.append(fp)
|
54 |
+
self.xmi_documents.append(str(self.zip_project.open(fp).read().decode("utf-8")))
|
55 |
+
|
56 |
+
|
57 |
+
else:
|
58 |
+
with ZipFile(self.zip_project) as project_zip:
|
59 |
+
if self.type == "global":
|
60 |
+
regex = re.compile('.*curation/.*/(?!\._).*zip$')
|
61 |
+
elif self.type == "iaa":
|
62 |
+
regex = re.compile('.*xm[il]$')
|
63 |
+
|
64 |
+
annotation_fps = (fp for fp in project_zip.namelist() if regex.match(fp))
|
65 |
+
for fp in annotation_fps:
|
66 |
+
if self.type == "global":
|
67 |
+
with ZipFile(BytesIO(project_zip.read(fp))) as annotation_zip:
|
68 |
+
if self.typesystem is None:
|
69 |
+
self.typesystem = load_typesystem(BytesIO(annotation_zip.read('TypeSystem.xml')))
|
70 |
+
for f in annotation_zip.namelist():
|
71 |
+
if f.endswith('.xmi'):
|
72 |
+
# store source filename
|
73 |
+
self.documents.append(Path(fp).parent.name)
|
74 |
+
# annotators = []
|
75 |
+
# store XMI representation
|
76 |
+
self.xmi_documents.append(str(annotation_zip.read(f).decode("utf-8")))
|
77 |
+
elif self.type == "iaa":
|
78 |
+
if self.typesystem is None and fp.endswith('.xml'):
|
79 |
+
self.typesystem = load_typesystem(BytesIO(project_zip.read('TypeSystem.xml')))
|
80 |
+
else:
|
81 |
+
if fp.endswith('.xmi'):
|
82 |
+
# store source filename
|
83 |
+
self.documents.append(fp)
|
84 |
+
# set annotators
|
85 |
+
self.annotators.append(os.path.splitext(fp)[0])
|
86 |
+
# store XMI representation
|
87 |
+
self.xmi_documents.append(str(project_zip.read(fp).decode("utf-8")))
|
88 |
|
89 |
|
90 |
self.extract_ne()
|