lterriel committed on
Commit
8c99444
•
1 Parent(s): f208c9e

[feature] add credentials form to retrieve data via inception API remote

Browse files
.gitignore CHANGED
@@ -1,2 +1,4 @@
1
  venv/
2
- .idea/
 
 
 
1
  venv/
2
+ .idea/
3
+ __pycache__/
4
+ LEGACY.py
app.py CHANGED
@@ -1,45 +1,54 @@
1
  #!/usr/bin/env python3
2
  # -*- coding:utf-8 -*-
3
-
4
-
5
 
6
  import streamlit as st
7
  from streamlit.components.v1 import html
8
  from n4a_analytics_lib.analytics import (GlobalStatistics, IaaStatistics)
 
9
 
10
 
11
- TITLE = "NER4ARCHIVES Analytics"
12
-
13
  # Set application
14
  st.set_page_config(layout="wide")
15
-
16
  # sidebar: meta, inputs etc.
17
  sidebar = st.sidebar
18
  # cols: display results
19
  col1, col2 = st.columns(2)
20
 
21
  # description
22
- #sidebar.markdown(f"# πŸ“ {TITLE}")
23
- sidebar.markdown(f"""
24
- # πŸ“ {TITLE}
25
-
26
- A basic web application to display a dashboard for
27
- analyzing INCEpTION annotation project built in context
28
- of NER4Archives (Inria/Archives nationales).
29
-
30
- - This tool provides two statistics levels:
31
- - *Global project statistics*: Analyze named entities in overall curated documents in project;
32
- - *Inter-Annotator Agreement results*: Analyze results of IAA experiment.
33
- """)
 
 
 
 
 
 
 
 
 
34
 
35
  # Level to analyze
36
- option = sidebar.selectbox('Which statistics level?', ('Inter-Annotator Agreement results', 'Global project statistics'))
 
37
 
38
  # IAA results view
39
  if option == "Inter-Annotator Agreement results":
40
  annotations = sidebar.file_uploader("Upload IAA annotations (.zip format only): ")
41
  baseline_text = sidebar.file_uploader("Upload baseline text (.txt format only): ")
42
 
 
43
  if baseline_text is not None and annotations is not None:
44
  project_analyzed = IaaStatistics(zip_project=annotations, baseline_text=baseline_text.getvalue())
45
  baseline_analyzer = project_analyzed.analyze_text()
@@ -308,24 +317,135 @@ if option == "Inter-Annotator Agreement results":
308
  st.pyplot(f.figure)
309
 
310
  # global project results view
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
 
312
 
313
- # to st components
314
- def clear_cache():
315
- st.session_state["p_a"] = None
316
 
317
  if option == "Global project statistics":
318
- project = sidebar.file_uploader("Project folder that contains curated annotations in XMI 1.1 (.zip format only) : ", on_change=clear_cache)
319
- if project is not None:
320
- if st.session_state["p_a"] is None:
321
- st.session_state["p_a"] = GlobalStatistics(zip_project=project)
322
- if st.session_state["p_a"] is not None:
323
- with st.expander('Details on data'):
324
- col1.metric("Total curated annotations",
325
- f"{st.session_state['p_a'].total_annotations_project} Named entities")
326
- col1.dataframe(st.session_state['p_a'].df_i)
327
- selected_data = col1.selectbox('Select specific data to display bar plot:',
328
- st.session_state['p_a'].documents)
329
- col2.pyplot(st.session_state['p_a'].create_plot(selected_data))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
330
 
331
 
 
1
  #!/usr/bin/env python3
2
  # -*- coding:utf-8 -*-
3
+ import requests.exceptions
4
+ import zipfile
5
 
6
  import streamlit as st
7
  from streamlit.components.v1 import html
8
  from n4a_analytics_lib.analytics import (GlobalStatistics, IaaStatistics)
9
+ from n4a_analytics_lib.constants import (DESCRIPTION)
10
 
11
 
 
 
12
  # Set application
13
  st.set_page_config(layout="wide")
 
14
  # sidebar: meta, inputs etc.
15
  sidebar = st.sidebar
16
  # cols: display results
17
  col1, col2 = st.columns(2)
18
 
19
  # description
20
+ sidebar.markdown(DESCRIPTION)
21
+
22
+
23
+
24
+
25
+ # to st components
26
+ #def clear_cache():
27
+ # st.session_state = {}
28
+
29
def check_login(username, password):
    """Report whether both credential fields have been filled in.

    Only emptiness is checked; nothing is verified against a server here.

    :param username: value typed into the username field.
    :param password: value typed into the password field.
    :return: True when both values have a non-zero length, False otherwise.
    """
    return len(username) > 0 and len(password) > 0
33
+
34
def logout():
    """Placeholder for a logout action; it currently performs no work."""
    return None
36
+
37
+
38
+
39
+
40
+
41
 
42
  # Level to analyze
43
+ option = sidebar.selectbox('Which statistics level?', ('Inter-Annotator Agreement results',
44
+ 'Global project statistics'))
45
 
46
  # IAA results view
47
  if option == "Inter-Annotator Agreement results":
48
  annotations = sidebar.file_uploader("Upload IAA annotations (.zip format only): ")
49
  baseline_text = sidebar.file_uploader("Upload baseline text (.txt format only): ")
50
 
51
+
52
  if baseline_text is not None and annotations is not None:
53
  project_analyzed = IaaStatistics(zip_project=annotations, baseline_text=baseline_text.getvalue())
54
  baseline_analyzer = project_analyzed.analyze_text()
 
317
  st.pyplot(f.figure)
318
 
319
  # global project results view
320
+ # st_session = {"gs_local":True, "gs_remote":False, "gs_obj":<object>}
321
+
322
def display_data():
    """Render the global-statistics widgets from the object kept in the session.

    Reads ``st.session_state['gs_obj']`` and writes a metric, a dataframe and
    a selector into ``col1``, then draws the plot for the selected entry
    into ``col2``.
    """
    stats = st.session_state['gs_obj']

    # Left column: headline figure, per-label table and the data selector.
    col1.metric("Total curated annotations",
                f"{stats.total_annotations_project} Named entities")
    col1.dataframe(stats.df_i)
    chosen = col1.selectbox('Select specific data to display bar plot:',
                            stats.documents, key="selector_data")

    # Right column: bar plot for whatever is currently selected.
    figure = stats.create_plot(chosen)
    col2.pyplot(figure)
329
+
330
def init_session_statistics(remote: bool, local: bool, data: tuple) -> None:
    """Reset the global-statistics session entries and build a new statistics object.

    :param remote: True when curated documents must be fetched from the
        INCEpTION API host.
    :param local: True when curated documents come from an uploaded archive.
    :param data: ``(username, password)`` in remote mode; in local mode the
        uploaded archive, which is handed unchanged to ``GlobalStatistics``.
    :return: None. The result is stored under ``st.session_state["gs_obj"]``.
    """
    # Remove only the entries owned by this view. Rebinding
    # ``st.session_state`` to a plain dict would replace Streamlit's
    # per-session store with a single module-level dictionary.
    for key in ("gs_local", "gs_remote", "gs_obj"):
        if key in st.session_state:
            del st.session_state[key]

    # Remember which retrieval mode produced the current statistics object.
    st.session_state["gs_local"] = local
    st.session_state["gs_remote"] = remote

    if remote and not local:
        # Remote mode: the fetch helper stores "gs_obj" itself on success.
        st.success('Fetch curated documents from host INCEpTION API in progress...')
        fetch_curated_data_from_remote(
            username=data[0],
            password=data[1]
        )
    elif local and not remote:
        st.session_state["gs_obj"] = GlobalStatistics(zip_project=data, remote=False)
349
+
350
+
351
+
352
+
353
+
354
+ from pycaprio import Pycaprio, mappings
355
+ from zipfile import ZipFile
356
+ import io
357
+ import requests
358
+
359
def fetch_curated_data_from_remote(username: str,
                                   password: str,
                                   endpoint: str = "https://inception.dhlab.epfl.ch/prod",
                                   project_title: str = "ner4archives-template"):
    """Download the curated documents of an INCEpTION project and load them for analysis.

    Every document whose state is ``CURATION_COMPLETE`` is exported as an
    archive in XMI 1.1 format. The members of all exported archives are
    merged into one in-memory ZIP, which is passed to ``GlobalStatistics``.
    The resulting object is stored under ``st.session_state["gs_obj"]``.

    :param username: INCEpTION account name.
    :param password: INCEpTION account password.
    :param endpoint: base URL of the INCEpTION host.
    :param project_title: name of the project to read.
    :return: None. On failure a message is shown with Streamlit and
        ``gs_obj`` is left unset.
    """
    # Open a client and list the projects. The listing sits inside the
    # ``try`` because it is the first call that needs a server answer.
    # NOTE(review): the client constructor may not contact the host at all;
    # confirm against the pycaprio documentation.
    try:
        client = Pycaprio(inception_host=endpoint, authentication=(str(username), str(password)))
        projects = client.api.projects()
    except requests.exceptions.JSONDecodeError:
        # Username / password incorrect: stop here, there is no usable client.
        st.error('Username or Password is incorrect please retry.')
        return

    # Get the project object.
    matching = [p for p in projects if p.project_name == project_title]
    if not matching:
        st.error(f'Project "{project_title}" was not found on the INCEpTION host.')
        return
    project_id = matching[0].project_id

    # Merge the members of every curated export into a single fresh archive.
    # The first occurrence of a member name wins, later duplicates are skipped.
    merged_buffer = io.BytesIO()
    stored_names = set()
    with ZipFile(merged_buffer, mode="w") as merged:
        for document in client.api.documents(project_id):
            if document.document_state != mappings.DocumentState.CURATION_COMPLETE:
                continue
            payload = client.api.curation(project_id, document,
                                          curation_format=mappings.InceptionFormat.UIMA_CAS_XMI_XML_1_1)
            with ZipFile(io.BytesIO(payload)) as exported:
                for member in exported.namelist():
                    if member not in stored_names:
                        merged.writestr(member, exported.read(member))
                        stored_names.add(member)

    if not stored_names:
        st.warning(f'No curated document found in project "{project_title}".')
        return

    # Reopen the finished archive for reading; it must stay open because the
    # statistics object reads its members.
    merged_buffer.seek(0)
    st.session_state["gs_obj"] = GlobalStatistics(zip_project=ZipFile(merged_buffer), remote=True)
405
+
406
 
407
 
 
 
 
408
 
409
if option == "Global project statistics":
    # Retrieval mode labels. Every comparison below uses these exact strings,
    # so a branch cannot silently stop matching the radio widget's options.
    mode_local = "Local directory"
    mode_remote = "INCEpTION API Host remote"

    # User input controllers
    mode = sidebar.radio("Choose mode to retrieve curated data: ", (
        mode_local, mode_remote
    ))
    data = None
    if mode == mode_local:
        project = sidebar.file_uploader("Folder that contains curated annotations in XMI 1.1 (.zip format only): ",
                                        type="zip")
        data = project
    elif mode == mode_remote:
        username = sidebar.text_input("Username: ")
        password = sidebar.text_input("Password: ", type="password")
        data = (username, password)

    # Validate inputs
    btn_process = sidebar.button('Process', key='process')

    # Access data with local resources
    if btn_process and mode == mode_local:
        # ``data`` stays None until a file has been uploaded.
        if data is not None:
            # create a new session
            init_session_statistics(remote=False, local=True, data=data)

    # Access data with remote resources
    if btn_process and mode == mode_remote:
        if check_login(username=data[0], password=data[1]):
            # create a new session
            init_session_statistics(remote=True, local=False, data=data)
        else:
            st.error("Sorry! Username or Password is empty.")

    # Change data values and visualize new plot
    if "gs_obj" in st.session_state:
        if st.session_state["gs_local"] or st.session_state["gs_remote"]:
            display_data()
446
+
447
+
448
+
449
+
450
 
451
 
n4a_analytics_lib/__pycache__/__init__.cpython-38.pyc CHANGED
Binary files a/n4a_analytics_lib/__pycache__/__init__.cpython-38.pyc and b/n4a_analytics_lib/__pycache__/__init__.cpython-38.pyc differ
 
n4a_analytics_lib/__pycache__/analytics.cpython-38.pyc CHANGED
Binary files a/n4a_analytics_lib/__pycache__/analytics.cpython-38.pyc and b/n4a_analytics_lib/__pycache__/analytics.cpython-38.pyc differ
 
n4a_analytics_lib/__pycache__/project.cpython-38.pyc CHANGED
Binary files a/n4a_analytics_lib/__pycache__/project.cpython-38.pyc and b/n4a_analytics_lib/__pycache__/project.cpython-38.pyc differ
 
n4a_analytics_lib/__pycache__/st_components.cpython-38.pyc CHANGED
Binary files a/n4a_analytics_lib/__pycache__/st_components.cpython-38.pyc and b/n4a_analytics_lib/__pycache__/st_components.cpython-38.pyc differ
 
n4a_analytics_lib/analytics.py CHANGED
@@ -15,9 +15,8 @@ from n4a_analytics_lib.project import Project
15
 
16
 
17
  class GlobalStatistics(Project):
18
- def __init__(self, zip_project):
19
- super().__init__(zip_project=zip_project, type="global")
20
-
21
  self.data = [(src_file, ne_label) for src_file, ann in self.annotations.items() for ne_label in ann['labels']]
22
  self.df_base = pd.DataFrame(self.data, columns=["SOURCE_FILE", "LABEL"])
23
  self.df_i = self.df_base.groupby(["LABEL"])["LABEL"].count().reset_index(name="TOTAL")
 
15
 
16
 
17
  class GlobalStatistics(Project):
18
+ def __init__(self, zip_project, remote=False):
19
+ super().__init__(zip_project=zip_project, remote=remote, type="global")
 
20
  self.data = [(src_file, ne_label) for src_file, ann in self.annotations.items() for ne_label in ann['labels']]
21
  self.df_base = pd.DataFrame(self.data, columns=["SOURCE_FILE", "LABEL"])
22
  self.df_i = self.df_base.groupby(["LABEL"])["LABEL"].count().reset_index(name="TOTAL")
n4a_analytics_lib/constants.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding:utf-8 -*-
2
+
3
+ TITLE = "NER4ARCHIVES Analytics"
4
+ DESCRIPTION = f"""
5
+ # πŸ“ {TITLE}
6
+
7
+ A basic web application to display a dashboard for
8
+ analyzing INCEpTION annotation project built in context
9
+ of NER4Archives (Inria/Archives nationales).
10
+
11
+ - This tool provides two statistics levels:
12
+ - *Global project statistics*: Analyze named entities in overall curated documents in project;
13
+ - *Inter-Annotator Agreement results*: Analyze results of IAA experiment.
14
+ """
n4a_analytics_lib/project.py CHANGED
@@ -1,5 +1,5 @@
1
  # -*- coding:utf-8 -*-
2
-
3
  from io import BytesIO
4
  import re
5
  from zipfile import ZipFile
@@ -13,9 +13,12 @@ from n4a_analytics_lib.st_components import st_pb
13
 
14
 
15
  class Project:
16
- def __init__(self, zip_project, type):
17
  # zip container that contains XMI and typesystem
18
  self.zip_project = zip_project
 
 
 
19
  # 'iaa' or 'global'
20
  self.type = type
21
 
@@ -42,36 +45,46 @@ class Project:
42
  self.annotations = {}
43
 
44
 
45
- with ZipFile(self.zip_project) as project_zip:
46
- if self.type == "global":
47
- regex = re.compile('.*curation/.*/(?!\._).*zip$')
48
- elif self.type == "iaa":
49
- regex = re.compile('.*xm[il]$')
50
-
51
- annotation_fps = (fp for fp in project_zip.namelist() if regex.match(fp))
52
- for fp in annotation_fps:
53
- if self.type == "global":
54
- with ZipFile(BytesIO(project_zip.read(fp))) as annotation_zip:
55
- if self.typesystem is None:
56
- self.typesystem = load_typesystem(BytesIO(annotation_zip.read('TypeSystem.xml')))
57
- for f in annotation_zip.namelist():
58
- if f.endswith('.xmi'):
59
- # store source filename
60
- self.documents.append(Path(fp).parent.name)
61
- # annotators = []
62
- # store XMI representation
63
- self.xmi_documents.append(str(annotation_zip.read(f).decode("utf-8")))
64
- elif self.type == "iaa":
65
- if self.typesystem is None and fp.endswith('.xml'):
66
- self.typesystem = load_typesystem(BytesIO(project_zip.read('TypeSystem.xml')))
67
- else:
68
- if fp.endswith('.xmi'):
69
- # store source filename
70
- self.documents.append(fp)
71
- # set annotators
72
- self.annotators.append(os.path.splitext(fp)[0])
73
- # store XMI representation
74
- self.xmi_documents.append(str(project_zip.read(fp).decode("utf-8")))
 
 
 
 
 
 
 
 
 
 
75
 
76
 
77
  self.extract_ne()
 
1
  # -*- coding:utf-8 -*-
2
+ import zipfile
3
  from io import BytesIO
4
  import re
5
  from zipfile import ZipFile
 
13
 
14
 
15
  class Project:
16
+ def __init__(self, zip_project, type, remote):
17
  # zip container that contains XMI and typesystem
18
  self.zip_project = zip_project
19
+
20
+ self.remote = remote
21
+
22
  # 'iaa' or 'global'
23
  self.type = type
24
 
 
45
  self.annotations = {}
46
 
47
 
48
+ if isinstance(self.zip_project, zipfile.ZipFile) and self.remote and self.type == "global":
49
+ for fp in self.zip_project.namelist():
50
+ if self.typesystem is None:
51
+ self.typesystem = load_typesystem(BytesIO(self.zip_project.open('TypeSystem.xml').read()))
52
+ if fp.endswith('.xmi'):
53
+ self.documents.append(fp)
54
+ self.xmi_documents.append(str(self.zip_project.open(fp).read().decode("utf-8")))
55
+
56
+
57
+ else:
58
+ with ZipFile(self.zip_project) as project_zip:
59
+ if self.type == "global":
60
+ regex = re.compile('.*curation/.*/(?!\._).*zip$')
61
+ elif self.type == "iaa":
62
+ regex = re.compile('.*xm[il]$')
63
+
64
+ annotation_fps = (fp for fp in project_zip.namelist() if regex.match(fp))
65
+ for fp in annotation_fps:
66
+ if self.type == "global":
67
+ with ZipFile(BytesIO(project_zip.read(fp))) as annotation_zip:
68
+ if self.typesystem is None:
69
+ self.typesystem = load_typesystem(BytesIO(annotation_zip.read('TypeSystem.xml')))
70
+ for f in annotation_zip.namelist():
71
+ if f.endswith('.xmi'):
72
+ # store source filename
73
+ self.documents.append(Path(fp).parent.name)
74
+ # annotators = []
75
+ # store XMI representation
76
+ self.xmi_documents.append(str(annotation_zip.read(f).decode("utf-8")))
77
+ elif self.type == "iaa":
78
+ if self.typesystem is None and fp.endswith('.xml'):
79
+ self.typesystem = load_typesystem(BytesIO(project_zip.read('TypeSystem.xml')))
80
+ else:
81
+ if fp.endswith('.xmi'):
82
+ # store source filename
83
+ self.documents.append(fp)
84
+ # set annotators
85
+ self.annotators.append(os.path.splitext(fp)[0])
86
+ # store XMI representation
87
+ self.xmi_documents.append(str(project_zip.read(fp).decode("utf-8")))
88
 
89
 
90
  self.extract_ne()