# -*- coding:utf-8 -*-
"""Streamlit UI helpers for NER4Archives analytics.

Provides cached CSV conversion, login validation, session bootstrap for
global statistics (local ZIP or remote INCEpTION host) and inter-annotator
agreement (IAA) reporting.
"""
import io
from zipfile import ZipFile

import pandas
import streamlit as st
from pycaprio import Pycaprio, mappings
from requests.exceptions import JSONDecodeError

from n4a_analytics_lib.analytics import (GlobalStatistics, IaaStatistics)
from n4a_analytics_lib.constants import KAPPA_LEGEND


@st.cache
def convert_df(df_ex: pandas.DataFrame) -> bytes:
    """Serialize a DataFrame to UTF-8 encoded CSV bytes (cached by Streamlit)."""
    return df_ex.to_csv(encoding="utf-8").encode('utf-8')


def check_login(username: str, password: str) -> bool:
    """Return True only when both username and password are non-empty."""
    return bool(username and password)


def display_data(col: st.columns) -> None:
    """Render curated-annotation metrics, table and per-document bar plot.

    Reads the ``GlobalStatistics`` object stored in
    ``st.session_state['gs_obj']``.
    """
    col.metric("Total curated annotations",
               f"{st.session_state['gs_obj'].total_annotations_project} Named entities")
    col.dataframe(st.session_state['gs_obj'].df_i)
    selected_data = col.selectbox('Select specific data to display bar plot:',
                                  st.session_state['gs_obj'].documents,
                                  key="selector_data")
    col.pyplot(st.session_state['gs_obj'].create_plot(selected_data))


def template_agreement_dataframe(title: str,
                                 df: pandas.DataFrame,
                                 total_pov: int,
                                 total_annotations: int,
                                 percentage_pov: float,
                                 mode: str) -> None:
    """Render one agreement/disagreement section: header, CSV download, table.

    :param title: section subheader text
    :param df: annotations table to display and export
    :param total_pov: count for this point of view (agree or disagree)
    :param total_annotations: total annotation count (denominator)
    :param percentage_pov: precomputed percentage for display
    :param mode: suffix used to build unique filenames / widget keys
    """
    st.subheader(title)
    st.markdown(f"{total_pov} / {total_annotations} annotations ({percentage_pov} %)")
    st.download_button(
        "Press to Download CSV",
        convert_df(df),
        f"csv_annotators_{mode}.csv",
        "text/csv",
        key=f'download-csv_{mode}'
    )
    st.dataframe(df)


def init_session_iaa(data: st.file_uploader,
                     baseline: st.file_uploader,
                     col: st.columns) -> None:
    """Build the IAA report: baseline summary, kappa metrics, plots, CSV exports.

    :param data: uploaded ZIP project containing annotator files
    :param baseline: uploaded baseline text file
    :param col: Streamlit column used for the baseline summary
    """
    project_analyzed = IaaStatistics(zip_project=data, baseline_text=baseline.getvalue())
    baseline_analyzer = project_analyzed.analyze_text()
    col.markdown(f"""
    ### BASELINE TEXT: {baseline.name}
    - sentences: {baseline_analyzer[0]}
    - words: {baseline_analyzer[1]}
    - characters: {baseline_analyzer[2]}
    """)
    st.markdown("## 📈 IAA metrics")
    col1_kappa, col2_kappa = st.columns(2)

    # Display Kappa group
    col1_kappa.subheader("Fleiss Kappa (global score for group):")
    col1_kappa.markdown(interpret_kappa(project_analyzed.fleiss_kappa),
                        unsafe_allow_html=True)

    # Display pairs kappa
    col1_kappa.subheader("Cohen Kappa (score for annotators pair):")
    for coders, c_k in project_analyzed.compute_pairs_cohen_kappa().items():
        col1_kappa.markdown(f"* {coders[0]} <> {coders[1]} : {interpret_kappa(c_k)}",
                            unsafe_allow_html=True)

    # Display Kappa legend
    col2_kappa.markdown(KAPPA_LEGEND, unsafe_allow_html=True)

    # Plot confusion matrix
    if st.checkbox('Display confusion matrix'):
        width = st.slider("matrix width", 1, 10, 14)
        height = st.slider("matrix height", 1, 10, 4)
        st.pyplot(project_analyzed.plot_confusion_matrix(width=width, height=height).figure)

    # Agree CSV
    template_agreement_dataframe(title="✅️ Agree annotations",
                                 df=project_analyzed.df_agree,
                                 total_pov=project_analyzed.total_agree,
                                 total_annotations=project_analyzed.total_annotations,
                                 percentage_pov=project_analyzed.percent_agree,
                                 mode="agree")
    # Disagree CSV
    template_agreement_dataframe(title="❌ Disagree annotations",
                                 df=project_analyzed.df_disagree,
                                 total_pov=project_analyzed.total_disagree,
                                 total_annotations=project_analyzed.total_annotations,
                                 percentage_pov=project_analyzed.percent_disagree,
                                 mode="disagree")

    # Pie plot
    st.subheader("🏷️ Global Labels Statistics")
    st.pyplot(project_analyzed.plot_agreement_pies().figure)


def init_session_statistics(remote: bool, local: bool, data: tuple) -> None:
    """(Re)initialise the session for global statistics.

    Clears ``st.session_state``, records the chosen source, then builds a
    ``GlobalStatistics`` object either from the remote INCEpTION host
    (``data`` is a ``(username, password)`` tuple) or from a local ZIP
    upload (``data`` is the uploaded file).
    """
    # clear session
    st.session_state = {}

    # create session variables
    st.session_state["gs_local"] = local
    st.session_state["gs_remote"] = remote

    # if remote, fetch data from API host first
    if remote and not local:
        st.success('Fetch curated documents from host INCEpTION API in progress...')
        try:
            fetch_curated_data_from_remote(
                username=data[0],
                password=data[1]
            )
        except JSONDecodeError:
            # username / password incorrect
            st.error('Username or Password is incorrect please retry.')
            st.session_state = {}

    if local and not remote:
        st.session_state["gs_obj"] = GlobalStatistics(zip_project=data, remote=False)


def fetch_curated_data_from_remote(username: str,
                                   password: str,
                                   endpoint: str = "https://inception.dhlab.epfl.ch/prod",
                                   project_title: str = "ner4archives-template") -> None:
    """Fetch all curated documents from the INCEpTION host and merge them.

    Downloads every document in ``project_title`` whose state is
    CURATION_COMPLETE as an UIMA CAS XMI ZIP, merges all archives into the
    first one, and stores the resulting ``GlobalStatistics`` object in
    ``st.session_state['gs_obj']``.

    :raises JSONDecodeError: propagated when credentials are rejected by the API
    """
    # open a client
    client = Pycaprio(inception_host=endpoint,
                      authentication=(str(username), str(password)))

    # get project object
    project_name = [p for p in client.api.projects() if p.project_name == project_title]
    project_id = project_name[0].project_id

    # retrieve only curated documents as ZIP payloads
    curations = [
        client.api.curation(project_id,
                            document,
                            curation_format=mappings.InceptionFormat.UIMA_CAS_XMI_XML_1_1)
        for document in documents_iter(client, project_id)
    ] if False else [
        client.api.curation(project_id,
                            document,
                            curation_format=mappings.InceptionFormat.UIMA_CAS_XMI_XML_1_1)
        for document in client.api.documents(project_id)
        if document.document_state == mappings.DocumentState.CURATION_COMPLETE
    ]

    # wrap each payload in a ZipFile exactly once; the first archive is opened
    # in append mode so the others can be merged into it
    zipfiles = [ZipFile(io.BytesIO(curation), mode="a" if i == 0 else "r")
                for i, curation in enumerate(curations)]

    # Merge all zips into the first one, skipping duplicate entry names
    with zipfiles[0] as z1:
        for zf in zipfiles[1:]:
            for n in zf.namelist():
                if n not in z1.namelist():
                    z1.writestr(n, zf.open(n).read())
        # Create a new object while the merged archive is still open
        st.session_state["gs_obj"] = GlobalStatistics(zip_project=z1, remote=True)


def interpret_kappa(score: float) -> str:
    """Return the kappa score as a colored HTML percentage string.

    Thresholds follow the conventional kappa interpretation scale; the color
    is embedded in a span so callers can render it with
    ``unsafe_allow_html=True`` (matching KAPPA_LEGEND).
    """
    if score < 0:
        color = "#e74c3c;"   # worse than chance
    elif score <= 0.20:
        color = "#f39c12;"   # slight
    elif score <= 0.40:
        color = "#f4d03f;"   # fair
    elif score <= 0.60:
        color = "#5dade2;"   # moderate
    elif score <= 0.80:
        color = "#58d68d;"   # substantial
    else:
        color = "#28b463;"   # almost perfect
    return f"<span style=\"color: {color}\">{round(score * 100, 2)} %</span>"