# -*- coding:utf-8 -*-
"""Statistics helpers for annotation projects.

``GlobalStatistics`` aggregates named-entity label counts across every
annotated source file; ``IaaStatistics`` prepares per-annotator data for
inter-annotator agreement analysis.
"""

import pandas as pd
import seaborn as sns
import matplotlib
matplotlib.use('Agg')  # headless backend; must be set before pyplot is imported
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')  # tokenizer models; no-op if already downloaded
from nltk.tokenize import sent_tokenize, word_tokenize

from n4a_analytics_lib.project import Project


class GlobalStatistics(Project):
    """Project-wide and per-file label count statistics."""

    def __init__(self, zip_project, remote=False):
        super().__init__(zip_project=zip_project, remote=remote, type="global")
        # One (source_file, label) row per annotated mention.
        self.data = [
            (src_file, ne_label)
            for src_file, ann in self.annotations.items()
            for ne_label in ann['labels']
        ]
        self.df_base = pd.DataFrame(self.data, columns=["SOURCE_FILE", "LABEL"])
        # Totals per label over the whole project.
        self.df_i = (
            self.df_base.groupby(["LABEL"])["LABEL"]
            .count()
            .reset_index(name="TOTAL")
        )
        # Totals per label within each source file (used by create_plot).
        self.df_details = (
            self.df_base.groupby(["SOURCE_FILE", "LABEL"])["LABEL"]
            .count()
            .reset_index(name="TOTAL")
        )
        self.total_annotations_project = self.df_i['TOTAL'].sum()

    def create_plot(self, type_data):
        """Return a bar-chart Figure of label totals for one source file.

        :param type_data: value matched against the ``SOURCE_FILE`` column.
        :return: a matplotlib ``Figure`` with one value-labelled bar per label.
        """
        # apply data filter
        data_tab_filtered = self.df_details.loc[
            self.df_details['SOURCE_FILE'] == type_data
        ]
        # Create a genuinely new figure. sns.barplot() without ax= draws on
        # the *current* axes, so repeated calls would overlay bars on the same
        # figure and accumulate figures under the Agg backend.
        fig, ax = plt.subplots()
        sns.barplot(x='LABEL', y='TOTAL', data=data_tab_filtered, ax=ax)
        # add title to plot
        fig.suptitle(type_data)
        # add value labels to bars
        for container in ax.containers:
            ax.bar_label(container)
        return fig


class IaaStatistics(Project):
    """Per-annotator mention/label data for inter-annotator agreement."""

    def __init__(self, zip_project, baseline_text):
        super().__init__(zip_project=zip_project, type="iaa")
        self.baseline_text = baseline_text.decode('utf-8')
        # self.docs = {}
        # self.pairwise = {}
        # self.similar_mention = []
        self.mentions_per_coder = self.extract_refs(
            self.annotations, self.annotators, type="mentions"
        )
        self.labels_per_coder = self.extract_refs(
            self.annotations, self.annotators, type="labels"
        )
        # coder -> {mention: label}; mentions/labels are paired positionally.
        self.annotations_per_coders = {
            coder: dict(zip(ann[1]['mentions'], ann[1]['labels']))
            for coder, ann in zip(self.annotators, self.annotations.items())
        }

    @staticmethod
    def extract_refs(annotations, annotators, type):
        """Map each annotator to its entry of the requested kind.

        :param annotations: mapping whose values are dicts keyed by ref kind
            (e.g. "mentions", "labels") — assumed aligned positionally with
            ``annotators``.
        :param annotators: coder names.
        :param type: which entry to extract ("mentions" or "labels").
            NOTE: shadows the builtin ``type``; kept for API compatibility.
        :return: dict of coder -> data for the matching ref kind.
        """
        return {
            coder: data
            for coder, ann in zip(annotators, annotations.items())
            for ref, data in ann[1].items()
            if ref == type
        }

    def analyze_text(self):
        """Return total sentences, words and characters (in that order) of
        the baseline text, tokenized as French."""
        return [
            len(sent_tokenize(self.baseline_text, language="french")),
            len(word_tokenize(self.baseline_text, language="french")),
            len(self.baseline_text),
        ]