Updating

Browse files
- app.py +33 -9
- appStore/adapmit.py +76 -114
- appStore/doc_processing.py +77 -0
- appStore/ghg.py +95 -0
- appStore/info.py +0 -67
- appStore/multiapp.py +0 -67
- appStore/netzero.py +34 -145
- appStore/sector.py +53 -96
- appStore/target.py +62 -162
- paramconfig.cfg +21 -1
- utils/adapmit_classifier.py +18 -53
- utils/ghg_classifier.py +90 -0
- utils/netzero_classifier.py +16 -63
- utils/preprocessing.py +26 -10
- utils/sector_classifier.py +20 -58
- utils/target_classifier.py +16 -63
- utils/uploadAndExample.py +6 -0
app.py
CHANGED
@@ -2,19 +2,43 @@ import appStore.target as target_extraction
 import appStore.netzero as netzero
 import appStore.sector as sector
 import appStore.adapmit as adapmit
-
-
+import appStore.ghg as ghg
+import appStore.doc_processing as processing
+from utils.uploadAndExample import add_upload
 import streamlit as st

 st.set_page_config(page_title = 'Climate Policy Intelligence',
                    initial_sidebar_state='expanded', layout="wide")

-
-
-
-    app.add_app("Sector Classification","gear", sector.app)
-    app.add_app("Adaptation-Mitigation","gear", adapmit.app)
-
+with st.sidebar:
+    # upload and example doc
+    choice = st.sidebar.radio(label = 'Select the Document',
+                              help = 'You can upload the document \
+                              or else you can try a example document',
+                              options = ('Upload Document', 'Try Example'),
+                              horizontal = True)
+    add_upload(choice)
+
+with st.container():
+    st.markdown("<h2 style='text-align: center; color: black;'> Climate Policy Intelligence App </h2>", unsafe_allow_html=True)
+    st.write(' ')
+
+# with st.expander("ℹ️ - About this app", expanded=False):
+#     st.write(
+#         """
+#         Climate Policy Understanding App is an open-source \
+#         digital tool which aims to assist policy analysts and \
+#         other users in extracting and filtering relevant \
+#         information from public documents.
+#         """)
+#     st.write("")
+
+apps = [processing.app, target_extraction.app, netzero.app, ghg.app,
+        sector.app, adapmit.app]
+multiplier_val = int(100/len(apps))
+if st.button("Get the work done"):
+    prg = st.progress(0)
+    for i, func in enumerate(apps):
+        func()
+        prg.progress((i+1)*multiplier_val)
+    if 'key1' in st.session_state:
+        st.write(st.session_state.key1)
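The rewritten app.py drops the MultiApp selector and simply calls each sub-app's app() in sequence; the modules hand data to each other only through st.session_state (the processing step writes the paragraph dataframe to 'key0', the classifiers read and update 'key1'). A minimal sketch of that hand-off pattern, using two hypothetical step functions rather than the real modules:

import streamlit as st

# Hypothetical two-step illustration of the session_state hand-off used above.
def step_process():
    # would normally parse the uploaded document into a paragraph dataframe
    st.session_state['key0'] = ["paragraph 1", "paragraph 2"]

def step_classify():
    paragraphs = st.session_state['key0']            # read what the previous step produced
    st.session_state['key1'] = [(p, 'TARGET') for p in paragraphs]

apps = [step_process, step_classify]
if st.button("Get the work done"):
    prg = st.progress(0)
    for i, func in enumerate(apps):
        func()
        prg.progress(int((i + 1) * 100 / len(apps)))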
appStore/adapmit.py
CHANGED
@@ -8,10 +8,7 @@ import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import streamlit as st
-
-# from st_aggrid.shared import ColumnsAutoSizeMode
-from utils.adapmit_classifier import adapmit_classification
-from utils.adapmit_classifier import runAdapMitPreprocessingPipeline, load_adapmitClassifier
+from utils.adapmit_classifier import load_adapmitClassifier, adapmit_classification
 # from utils.keyword_extraction import textrank
 import logging
 logger = logging.getLogger(__name__)
@@ -48,122 +45,87 @@ def to_excel(df):

 def app():

-    #### APP INFO #####
-    with st.container():
-        st.markdown("<h1 style='text-align: center; color: black;'> Adaptation-Mitigation Classification </h1>", unsafe_allow_html=True)
-        st.write(' ')
-        st.write(' ')
-
-    with st.expander("ℹ️ - About this app", expanded=False):
-
-        st.write(
-            """
-            The **Adaptation-Mitigation Classification** app is an easy-to-use interface built \
-            in Streamlit for analyzing policy documents for \
-            Classification of the paragraphs/texts in the document *If it \
-            belongs to 'Adaptation' and 'Mitigation' category or not. The paragraph \
-            can belong to both category too. \
-            - developed by GIZ Data Service Center, GFA, IKI Tracs, \
-            SV Klima and SPA. \n
-            """)
-        st.write("""**Document Processing:** The Uploaded/Selected document is \
-            automatically cleaned and split into paragraphs with a maximum \
-            length of 60 words using a Haystack preprocessing pipeline. The \
-            length of 60 is an empirical value which should reflect the length \
-            of a “context” and should limit the paragraph length deviation. \
-            However, since we want to respect the sentence boundary the limit \
-            can breach and hence this limit of 60 is tentative. \n
-            """)
-
-        st.write("")
-
     ### Main app code ###
     with st.container():
-        if st.button("RUN Adaptation-Mitigation Classification"):
-            if 'key4' not in st.session_state:
-                st.session_state['key4'] = None
-
-            #
-            # st.write("Total Paragraphs: {}".format(len(df)))
-            fig = px.bar(count_df, y='category', x='count',
-                         color='truth_value',orientation='h', height =200)
-            c1, c2 = st.columns([1,1])
-            with c1:
-                st.plotly_chart(fig,use_container_width= True)
-
-            truth_df['labels'] = truth_df.apply(lambda x: {i if x[i]=='True' else None for i in categories}, axis=1)
-            truth_df['labels'] = truth_df.apply(lambda x: list(x['labels'] -{None}),axis=1)
-            # st.write(truth_df)
-            df = pd.concat([df,truth_df['labels']],axis=1)
-            st.markdown("###### Top few 'Mitigation' related paragraph/text ######")
-            df = df.sort_values(by = ['Mitigation'], ascending=False)
-            for i in range(3):
-                if df.iloc[i]['Mitigation'] >= 0.50:
-                    st.write('**Result {}** (Relevancy Score: {:.2f})'.format(i+1,df.iloc[i]['Mitigation']))
-                    st.write("\t Text: \t{}".format(df.iloc[i]['text'].replace("\n", " ")))
-        else:
+        if 'key1' in st.session_state:
+            df = st.session_state.key1
+
+            classifier = load_adapmitClassifier(classifier_name=params['model_name'])
+            st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
+            if sum(df['Target Label'] == 'TARGET') > 100:
+                warning_msg = ": This might take sometime, please sit back and relax."
+            else:
+                warning_msg = ""
+
+            df = adapmit_classification(haystack_doc=df,
+                                        threshold= params['threshold'])
+
+            st.session_state.key1 = df
+
+            # threshold= params['threshold']
+            # truth_df = df.drop(['text'],axis=1)
+            # truth_df = truth_df.astype(float) >= threshold
+            # truth_df = truth_df.astype(str)
+            # categories = list(truth_df.columns)
+
+            # placeholder = {}
+            # for val in categories:
+            #     placeholder[val] = dict(truth_df[val].value_counts())
+            # count_df = pd.DataFrame.from_dict(placeholder)
+            # count_df = count_df.T
+            # count_df = count_df.reset_index()
+            # # st.write(count_df)
+            # placeholder = []
+            # for i in range(len(count_df)):
+            #     placeholder.append([count_df.iloc[i]['index'],count_df['True'][i],'Yes'])
+            #     placeholder.append([count_df.iloc[i]['index'],count_df['False'][i],'No'])
+            # count_df = pd.DataFrame(placeholder, columns = ['category','count','truth_value'])
+            # # st.write("Total Paragraphs: {}".format(len(df)))
+            # fig = px.bar(count_df, y='category', x='count',
+            #              color='truth_value',orientation='h', height =200)
+            # c1, c2 = st.columns([1,1])
+            # with c1:
+            #     st.plotly_chart(fig,use_container_width= True)
+
+            # truth_df['labels'] = truth_df.apply(lambda x: {i if x[i]=='True' else None for i in categories}, axis=1)
+            # truth_df['labels'] = truth_df.apply(lambda x: list(x['labels'] -{None}),axis=1)
+            # # st.write(truth_df)
+            # df = pd.concat([df,truth_df['labels']],axis=1)
+            # st.markdown("###### Top few 'Mitigation' related paragraph/text ######")
+            # df = df.sort_values(by = ['Mitigation'], ascending=False)
+            # for i in range(3):
+            #     if df.iloc[i]['Mitigation'] >= 0.50:
+            #         st.write('**Result {}** (Relevancy Score: {:.2f})'.format(i+1,df.iloc[i]['Mitigation']))
+            #         st.write("\t Text: \t{}".format(df.iloc[i]['text'].replace("\n", " ")))

+            # st.markdown("###### Top few 'Adaptation' related paragraph/text ######")
+            # df = df.sort_values(by = ['Adaptation'], ascending=False)
+            # for i in range(3):
+            #     if df.iloc[i]['Adaptation'] > 0.5:
+            #         st.write('**Result {}** (Relevancy Score: {:.2f})'.format(i+1,df.iloc[i]['Adaptation']))
+            #         st.write("\t Text: \t{}".format(df.iloc[i]['text'].replace("\n", " ")))
+            # # st.write(df[['text','labels']])
+            # df['Validation'] = 'No'
+            # df['Val-Mitigation'] = 'No'
+            # df['Val-Adaptation'] = 'No'
+            # df_xlsx = to_excel(df)
+            # st.download_button(label='📥 Download Current Result',
+            #                    data=df_xlsx ,
+            #                    file_name= 'file_adaptation-mitigation.xlsx')
+            # # st.session_state.key4 =

+            # # category =set(df.columns)
+            # # removecols = {'Validation','Val-Adaptation','Val-Mitigation','text'}
+            # # category = list(category - removecols)

+        # else:
+        #     st.info("🤔 No document found, please try to upload it at the sidebar!")
+        #     logging.warning("Terminated as no document provided")

 # # Creating truth value dataframe
 # if 'key4' in st.session_state:
appStore/doc_processing.py
ADDED
@@ -0,0 +1,77 @@
+# set path
+import glob, os, sys;
+sys.path.append('../utils')
+from typing import List, Tuple
+from typing_extensions import Literal
+from haystack.schema import Document
+from utils.config import get_classifier_params
+from utils.preprocessing import processingpipeline, paraLengthCheck
+import streamlit as st
+import logging
+import pandas as pd
+params = get_classifier_params("preprocessing")
+
+@st.cache_data
+def runPreprocessingPipeline(file_name:str, file_path:str,
+        split_by: Literal["sentence", "word"] = 'sentence',
+        split_length:int = 2, split_respect_sentence_boundary:bool = False,
+        split_overlap:int = 0, remove_punc:bool = False)->List[Document]:
+    """
+    Creates and runs the preprocessing pipeline; the params for the pipeline
+    are fetched from paramconfig.
+    Params
+    ------------
+    file_name: filename, in case of streamlit application use
+        st.session_state['filename']
+    file_path: filepath, in case of streamlit application use st.session_state['filepath']
+    split_by: document splitting strategy, either word or sentence
+    split_length: when synthetically creating paragraphs from the document,
+        it defines the length of a paragraph.
+    split_respect_sentence_boundary: used with the 'word' splitting strategy.
+    split_overlap: number of words or sentences that overlap when creating
+        the paragraphs. This is done because one sentence or 'some words' only
+        make sense when read together with the others; therefore the overlap is used.
+    remove_punc: whether to remove all punctuation, including ',' and '.'
+    Return
+    --------------
+    List[Document]: when the preprocessing pipeline is run, the output dictionary
+        has four objects. For the Haystack implementation of SDG classification we
+        need the list of Haystack Documents, which can be fetched with
+        key = 'documents' on the output.
+    """
+
+    processing_pipeline = processingpipeline()
+
+    output_pre = processing_pipeline.run(file_paths = file_path,
+                    params= {"FileConverter": {"file_path": file_path, \
+                                               "file_name": file_name},
+                             "UdfPreProcessor": {"remove_punc": remove_punc, \
+                                                 "split_by": split_by, \
+                                                 "split_length": split_length, \
+                                                 "split_overlap": split_overlap, \
+                                                 "split_respect_sentence_boundary": split_respect_sentence_boundary}})
+
+    return output_pre
+
+
+def app():
+    with st.container():
+        if 'filepath' in st.session_state:
+            file_name = st.session_state['filename']
+            file_path = st.session_state['filepath']
+
+            all_documents = runPreprocessingPipeline(file_name= file_name,
+                            file_path= file_path, split_by= params['split_by'],
+                            split_length= params['split_length'],
+                            split_respect_sentence_boundary= params['split_respect_sentence_boundary'],
+                            split_overlap= params['split_overlap'], remove_punc= params['remove_punc'])
+            paralist = paraLengthCheck(all_documents['documents'], 100)
+            df = pd.DataFrame(paralist, columns = ['text','page'])
+            # saving the dataframe to session state
+            st.session_state['key0'] = df
+
+        else:
+            st.info("🤔 No document found, please try to upload it at the sidebar!")
+            logging.warning("Terminated as no document provided")
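Downstream modules pick up the dataframe that doc_processing stores under st.session_state['key0'] (columns 'text' and 'page'); appStore/target.py in this changeset starts from exactly that key. A minimal sketch of that consumption pattern, with a hypothetical classify_fn standing in for the real classifiers:

import streamlit as st

# Sketch of how a downstream app consumes the preprocessed paragraphs
# (mirrors the pattern in appStore/target.py; classify_fn is hypothetical).
def downstream_app(classify_fn):
    if 'key0' not in st.session_state:
        st.info("🤔 No document found, please try to upload it at the sidebar!")
        return
    df = st.session_state['key0']      # dataframe with 'text' and 'page' columns
    df = classify_fn(df)               # adds label/score columns
    st.session_state['key1'] = df      # shared with the *_display() functions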
appStore/ghg.py
ADDED
@@ -0,0 +1,95 @@
+# set path
+import glob, os, sys;
+sys.path.append('../utils')
+
+#import needed libraries
+import seaborn as sns
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import streamlit as st
+from utils.ghg_classifier import load_ghgClassifier, ghg_classification
+import logging
+logger = logging.getLogger(__name__)
+from utils.config import get_classifier_params
+from io import BytesIO
+import xlsxwriter
+import plotly.express as px
+
+
+# Declare all the necessary variables
+classifier_identifier = 'ghg'
+params = get_classifier_params(classifier_identifier)
+
+# Labels dictionary ###
+_lab_dict = {
+    'NEGATIVE':'NO GHG TARGET',
+    'NA':'NOT APPLICABLE',
+    'TARGET':'GHG TARGET',
+}
+
+
+@st.cache_data
+def to_excel(df):
+    len_df = len(df)
+    output = BytesIO()
+    writer = pd.ExcelWriter(output, engine='xlsxwriter')
+    df.to_excel(writer, index=False, sheet_name='Sheet1')
+    workbook = writer.book
+    worksheet = writer.sheets['Sheet1']
+    worksheet.data_validation('E2:E{}'.format(len_df),
+                              {'validate': 'list',
+                               'source': ['No', 'Yes', 'Discard']})
+    writer.save()
+    processed_data = output.getvalue()
+    return processed_data
+
+def app():
+    ### Main app code ###
+    with st.container():
+        if 'key1' in st.session_state:
+            df = st.session_state.key1
+
+            # Load the classifier model
+            classifier = load_ghgClassifier(classifier_name=params['model_name'])
+            st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
+
+            if sum(df['Target Label'] == 'TARGET') > 100:
+                warning_msg = ": This might take sometime, please sit back and relax."
+            else:
+                warning_msg = ""
+
+            df = ghg_classification(haystack_doc=df,
+                                    threshold= params['threshold'])
+            st.session_state.key1 = df
+
+
+def netzero_display():
+    if 'key1' in st.session_state:
+        df = st.session_state.key2
+        hits = df[df['GHG Label'] == 'TARGET']
+        range_val = min(5,len(hits))
+        if range_val !=0:
+            count_df = df['GHG Label'].value_counts()
+            count_df = count_df.rename('count')
+            count_df = count_df.rename_axis('GHG Label').reset_index()
+            count_df['Label_def'] = count_df['GHG Label'].apply(lambda x: _lab_dict[x])
+
+            fig = px.bar(count_df, y="Label_def", x="count", orientation='h', height =200)
+            c1, c2 = st.columns([1,1])
+            with c1:
+                st.plotly_chart(fig, use_container_width= True)
+
+            hits = hits.sort_values(by=['GHG Score'], ascending=False)
+            st.write("")
+            st.markdown("###### Top few GHG Target Classified paragraph/text results ######")
+            range_val = min(5,len(hits))
+            for i in range(range_val):
+                # the page number reflects the page that contains the main paragraph
+                # according to split limit, the overlapping part can be on a separate page
+                st.write('**Result {}** `page {}` (Relevancy Score: {:.2f})'.format(i+1, hits.iloc[i]['page'], hits.iloc[i]['GHG Score']))
+                st.write("\t Text: \t{}".format(hits.iloc[i]['text']))
+        else:
+            st.info("🤔 No GHG target found")
appStore/info.py
DELETED
@@ -1,67 +0,0 @@
-import streamlit as st
-import os
-from PIL import Image
-_ROOT = os.path.abspath(os.path.dirname(__file__))
-def get_data(path):
-    return os.path.join(_ROOT, 'data', path)
-
-def app():
-
-    with open('style.css') as f:
-        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
-
-    st.markdown("<h2 style='text-align: center; \
-                color: black;'> Climate Policy Understanding App</h2>",
-                unsafe_allow_html=True)
-
-    st.markdown("<div style='text-align: center; \
-                color: grey;'>Climate Policy Understanding App is an open-source\
-                digital tool which aims to assist policy analysts and \
-                other users in extracting and filtering relevant \
-                information from public documents.</div>",
-                unsafe_allow_html=True)
-    footer = """
-        <div class="footer-custom">
-            Guidance & Feedback - <a>Nadja Taeger</a> | <a>Marie Hertel</a> | <a>Cecile Schneider</a> |
-            Developer - <a href="https://www.linkedin.com/in/erik-lehmann-giz/" target="_blank">Erik Lehmann</a> |
-            <a href="https://www.linkedin.com/in/prashantpsingh/" target="_blank">Prashant Singh</a> |
-        </div>
-    """
-    st.markdown(footer, unsafe_allow_html=True)
-
-    c1, c2, c3 = st.columns([8,1,12])
-    with c1:
-        image = Image.open('docStore/img/ndc.png')
-        st.image(image)
-    with c3:
-        st.markdown('<div style="text-align: justify;">The manual extraction \
-            of relevant information from text documents is a \
-            time-consuming task for any policy analyst. As the amount and length of \
-            public policy documents in relation to sustainable development (such as \
-            National Development Plans and Nationally Determined Contributions) \
-            continuously increases, a major challenge for policy action tracking – the \
-            evaluation of stated goals and targets and their actual implementation on \
-            the ground – arises. Luckily, Artificial Intelligence (AI) and Natural \
-            Language Processing (NLP) methods can help in shortening and easing this \
-            task for policy analysts.</div><br>',
-            unsafe_allow_html=True)
-
-    intro = """
-        <div style="text-align: justify;">
-        For this purpose, IKI Tracs, SV KLIMA, SPA and the Data Service Center (Deutsche Gesellschaft für Internationale \
-        Zusammenarbeit (GIZ) GmbH) have been collaborating since 2022 on the development \
-        of an AI-powered open-source web application that helps find and extract \
-        relevant information from public policy documents faster to facilitate \
-        evidence-based decision-making processes in sustainable development and beyond.
-        </div>
-        <br>
-    """
-    st.markdown(intro, unsafe_allow_html=True)
-    image2 = Image.open('docStore/img/paris.png')
-    st.image(image2)
appStore/multiapp.py
DELETED
@@ -1,67 +0,0 @@
-"""Frameworks for running multiple Streamlit applications as a single app.
-"""
-import streamlit as st
-from PIL import Image
-from utils.uploadAndExample import add_upload
-
-class MultiApp:
-    """Framework for combining multiple streamlit applications.
-    Usage:
-        def foo():
-            st.title("Hello Foo")
-        def bar():
-            st.title("Hello Bar")
-        app = MultiApp()
-        app.add_app("Foo", foo)
-        app.add_app("Bar", bar)
-        app.run()
-    It is also possible to keep each application in a separate file.
-        import foo
-        import bar
-        app = MultiApp()
-        app.add_app("Foo", foo.app)
-        app.add_app("Bar", bar.app)
-        app.run()
-    """
-    def __init__(self):
-        self.apps = []
-
-    def add_app(self, title, icon, func):
-        """Adds a new application.
-        Parameters
-        ----------
-        func:
-            the python function to render this app.
-        title:
-            title of the app. Appears in the dropdown in the sidebar.
-        """
-        self.apps.append({
-            "title": title,
-            "icon": icon,
-            "function": func
-        })
-
-    def run(self):
-
-        st.sidebar.write(format_func=lambda app: app['title'])
-        #image = Image.open('docStore/img/dsc_giz.png')
-        #st.sidebar.image(image, width =200)
-
-        with st.sidebar:
-            selected = st.selectbox("Select the Task to perform", [page["title"] for page in self.apps],)
-            st.markdown("---")
-
-        for index, item in enumerate(self.apps):
-            if item["title"] == selected:
-                self.apps[index]["function"]()
-                break
-
-        choice = st.sidebar.radio(label = 'Select the Document',
-                                  help = 'You can upload the document \
-                                  or else you can try a example document',
-                                  options = ('Upload Document', 'Try Example'),
-                                  horizontal = True)
-        add_upload(choice)
appStore/netzero.py
CHANGED
@@ -8,11 +8,7 @@ import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import streamlit as st
-
-# from st_aggrid.shared import ColumnsAutoSizeMode
-from utils.netzero_classifier import netzero_classification
-from utils.netzero_classifier import runNetZeroPreprocessingPipeline, load_netzeroClassifier
-# from utils.keyword_extraction import textrank
+from utils.netzero_classifier import load_netzeroClassifier, netzero_classification
 import logging
 logger = logging.getLogger(__name__)
 from utils.config import get_classifier_params
@@ -28,6 +24,7 @@ params = get_classifier_params(classifier_identifier)
 # Labels dictionary ###
 _lab_dict = {
     'NEGATIVE':'NO NETZERO TARGET',
+    'NA':'NOT APPLICABLE',
     'NETZERO':'NETZERO TARGET',
 }

@@ -48,159 +45,51 @@ def to_excel(df):
     return processed_data

 def app():
-
-    #### APP INFO #####
-    with st.container():
-        st.markdown("<h1 style='text-align: center; color: black;'> NetZero Target Extraction </h1>", unsafe_allow_html=True)
-        st.write(' ')
-        st.write(' ')
-
-    with st.expander("ℹ️ - About this app", expanded=False):
-
-        st.write(
-            """
-            The **NetZero Extraction** app is an easy-to-use interface built \
-            in Streamlit for analyzing policy documents for \
-            Classification of the paragraphs/texts in the document *If it \
-            contains any Net-Zero target related information* - \
-            developed by GIZ Data Service Center, GFA, IKI Tracs, \
-            SV Klima and SPA. \n
-            """)
-        st.write("""**Document Processing:** The Uploaded/Selected document is \
-            automatically cleaned and split into paragraphs with a maximum \
-            length of 60 words using a Haystack preprocessing pipeline. The \
-            length of 60 is an empirical value which should reflect the length \
-            of a “context” and should limit the paragraph length deviation. \
-            However, since we want to respect the sentence boundary the limit \
-            can breach and hence this limit of 60 is tentative. \n
-            """)
-
-        st.write("")
-
     ### Main app code ###
     with st.container():
-        st.session_state['key2'] = None
+        if 'key1' in st.session_state:
+            df = st.session_state.key1

-        if 'filepath' in st.session_state:
-            file_name = st.session_state['filename']
-            file_path = st.session_state['filepath']
-
-            # Do the preprocessing of the PDF
-
-            all_documents = runNetZeroPreprocessingPipeline(file_name= file_name,
-                            file_path= file_path, split_by= params['split_by'],
-                            split_length= params['split_length'],
-                            split_respect_sentence_boundary= params['split_respect_sentence_boundary'],
-                            split_overlap= params['split_overlap'], remove_punc= params['remove_punc'])
-
-            # st.dataframe(all_documents['documents'])
-
             # Load the classifier model
             classifier = load_netzeroClassifier(classifier_name=params['model_name'])
             st.session_state['{}_classifier'.format(classifier_identifier)] = classifier

-            if
+            if sum(df['Target Label'] == 'TARGET') > 100:
                 warning_msg = ": This might take sometime, please sit back and relax."
             else:
                 warning_msg = ""

-            # with st.spinner("Running Target Related Paragraph Extractions{}".format(warning_msg)):
-
-            df = netzero_classification(haystack_doc=all_documents['documents'],
+            df = netzero_classification(haystack_doc=df,
                                         threshold= params['threshold'])
-            st.session_state.
-            hits = df[df['Target Label'] == 'NETZERO']
-            range_val = min(5,len(hits))
-            if range_val !=0:
-                count_df = df['Target Label'].value_counts()
-                count_df = count_df.rename('count')
-                count_df = count_df.rename_axis('Target Label').reset_index()
-                count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
-
-                fig = px.bar(count_df, y="Label_def", x="count", orientation='h', height =200)
-                c1, c2 = st.columns([1,1])
-                with c1:
-                    st.plotly_chart(fig,use_container_width= True)
-
-                hits = hits.sort_values(by=['Relevancy'], ascending=False)
-                st.write("")
-                st.markdown("###### Top few NetZero Target Classified paragraph/text results ######")
-                range_val = min(5,len(hits))
-                for i in range(range_val):
-                    # the page number reflects the page that contains the main paragraph
-                    # according to split limit, the overlapping part can be on a separate page
-                    st.write('**Result {}** `page {}` (Relevancy Score: {:.2f})'.format(i+1,hits.iloc[i]['page'],hits.iloc[i]['Relevancy']))
-                    st.write("\t Text: \t{}".format(hits.iloc[i]['text']))
-            else:
-                st.info("🤔 No Netzero target found")
-            df['Validation'] = 'No'
-            df_xlsx = to_excel(df)
-            st.download_button(label='📥 Download Current Result',
-                               data=df_xlsx ,
-                               file_name= 'file_target.xlsx')
-
-        else:
-            st.info("🤔 No document found, please try to upload it at the sidebar!")
-            logging.warning("Terminated as no document provided")
-
-    # # Creating truth value dataframe
-    # if 'key2' in st.session_state:
-    #     if st.session_state.key2 is not None:
-    #         df = st.session_state.key2
-    #         st.markdown("###### Select the threshold for classifier ######")
-    #         c1, c2 = st.columns([1,1])
-
-    #         netzero_df = df[df['Target Label'] == 'NETZERO'].reset_index(drop = True)
-    #         if len(netzero_df) >0:
-    #             with c1:
-    #                 threshold = st.slider("Threshold", min_value=0.00, max_value=1.0,
-    #                                       step=0.01, value=0.5,
-    #                                       help = "Keep High Value if want refined result, low if dont want to miss anything" )
-
-    #             # creating the dataframe for value counts of Labels, along with 'title' of Labels
-    #             temp = df[df['Relevancy']>threshold]
-    #             count_df = temp['Target Label'].value_counts()
-    #             count_df = count_df.rename('count')
-    #             count_df = count_df.rename_axis('Target Label').reset_index()
-    #             count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
-
-    #             plt.rcParams['font.size'] = 25
-    #             colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(count_df)))
-    #             # plot
-    #             fig, ax = plt.subplots()
-    #             ax.pie(count_df['count'], colors=colors, radius=2, center=(4, 4),
-    #                    wedgeprops={"linewidth": 1, "edgecolor": "white"},
-    #                    textprops={'fontsize': 14},
-    #                    frame=False,labels =list(count_df.Label_def),
-    #                    labeldistance=1.2)
-    #             st.markdown("#### Anything related to NetZero Targets? ####")
-
-    #             c4, c5, c6 = st.columns([1,2,2])
-
-    #             with c5:
-    #                 st.pyplot(fig)
-    #             with c6:
-    #                 st.write(count_df[['Label_def','count']])
-
-    #             st.write("")
-
-    #             st.markdown("###### Top few NetZero Target Classified paragraph/text results ######")
-
-    #             st.dataframe(netzero_df.head())
-    #         else:
-    #             st.write("🤔 No Results found")
+            st.session_state.key1 = df
+
+def netzero_display():
+    if 'key1' in st.session_state:
+        df = st.session_state.key2
+        hits = df[df['Netzero Label'] == 'NETZERO']
+        range_val = min(5,len(hits))
+        if range_val !=0:
+            count_df = df['Netzero Label'].value_counts()
+            count_df = count_df.rename('count')
+            count_df = count_df.rename_axis('Netzero Label').reset_index()
+            count_df['Label_def'] = count_df['Netzero Label'].apply(lambda x: _lab_dict[x])
+
+            fig = px.bar(count_df, y="Label_def", x="count", orientation='h', height =200)
+            c1, c2 = st.columns([1,1])
+            with c1:
+                st.plotly_chart(fig, use_container_width= True)
+
+            hits = hits.sort_values(by=['Netzero Score'], ascending=False)
+            st.write("")
+            st.markdown("###### Top few NetZero Target Classified paragraph/text results ######")
+            range_val = min(5,len(hits))
+            for i in range(range_val):
+                # the page number reflects the page that contains the main paragraph
+                # according to split limit, the overlapping part can be on a separate page
+                st.write('**Result {}** `page {}` (Relevancy Score: {:.2f})'.format(i+1, hits.iloc[i]['page'], hits.iloc[i]['Netzero Score']))
+                st.write("\t Text: \t{}".format(hits.iloc[i]['text']))
+        else:
+            st.info("🤔 No Netzero target found")
appStore/sector.py
CHANGED
@@ -8,11 +8,7 @@ import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import streamlit as st
-
-# from st_aggrid.shared import ColumnsAutoSizeMode
-from utils.sector_classifier import sector_classification
-from utils.sector_classifier import runSectorPreprocessingPipeline, load_sectorClassifier
-# from utils.keyword_extraction import textrank
+from utils.sector_classifier import load_sectorClassifier, sector_classification
 import logging
 logger = logging.getLogger(__name__)
 from utils.config import get_classifier_params
@@ -58,107 +54,68 @@ def to_excel(df,sectorlist):

 def app():

-    #### APP INFO #####
-    with st.container():
-        st.markdown("<h1 style='text-align: center; color: black;'> Sector Classification </h1>", unsafe_allow_html=True)
-        st.write(' ')
-        st.write(' ')
-
-    with st.expander("ℹ️ - About this app", expanded=False):
-
-        st.write(
-            """
-            The **Sector Classification** app is an easy-to-use interface built \
-            in Streamlit for analyzing policy documents for \
-            Classification of the paragraphs/texts in the document *If it \
-            belongs to particular sector or not*. The paragraph can belong to multiple sectors - \
-            developed by GIZ Data Service Center, GFA, IKI Tracs, \
-            SV Klima and SPA. \n
-            """)
-        st.write("""**Document Processing:** The Uploaded/Selected document is \
-            automatically cleaned and split into paragraphs with a maximum \
-            length of 60 words using a Haystack preprocessing pipeline. The \
-            length of 60 is an empirical value which should reflect the length \
-            of a “context” and should limit the paragraph length deviation. \
-            However, since we want to respect the sentence boundary the limit \
-            can breach and hence this limit of 60 is tentative. \n
-            """)
-
-        st.write("")
-
     ### Main app code ###
     with st.container():
-        if st.button("RUN Sector Classification"):
-            if 'key' not in st.session_state:
-                st.session_state['key'] = None

-            if '
-            file_path = st.session_state['filepath']
-
-            all_documents = runSectorPreprocessingPipeline(file_name= file_name,
-                            file_path= file_path, split_by= params['split_by'],
-                            split_length= params['split_length'],
-                            split_respect_sentence_boundary= params['split_respect_sentence_boundary'],
-                            split_overlap= params['split_overlap'], remove_punc= params['remove_punc'])
-            # st.write(all_documents['documents'])
+        if 'key1' in st.session_state:
+            df = st.session_state.key1
             classifier = load_sectorClassifier(classifier_name=params['model_name'])
             st.session_state['{}_classifier'.format(classifier_identifier)] = classifier

-            if
+            if sum(df['Target Label'] == 'TARGET') > 100:
                 warning_msg = ": This might take sometime, please sit back and relax."
             else:
                 warning_msg = ""

-            # #st.write(all_documents['documents'],_lab_dict,classifier_identifier,params['threshold'])
-            # with st.spinner("Running Target Related Paragraph Extractions{}".format(warning_msg)):
-
-            df = sector_classification(haystack_doc=
+            df = sector_classification(haystack_doc=df,
                                         threshold= params['threshold'])
-            #
-            #
-            #
-            #
-            # st.
-            df['
-            df['
-            df['
-            st.
+
+            st.session_state.key1 = df
+
+            # # st.write(df)
+            # threshold= params['threshold']
+            # truth_df = df.drop(['text'],axis=1)
+            # truth_df = truth_df.astype(float) >= threshold
+            # truth_df = truth_df.astype(str)
+            # categories = list(truth_df.columns)
+
+            # placeholder = {}
+            # for val in categories:
+            #     placeholder[val] = dict(truth_df[val].value_counts())
+            # count_df = pd.DataFrame.from_dict(placeholder)
+            # count_df = count_df.T
+            # count_df = count_df.reset_index()
+            # # st.write(count_df)
+            # placeholder = []
+            # for i in range(len(count_df)):
+            #     placeholder.append([count_df.iloc[i]['index'],count_df['True'][i],'Yes'])
+            #     placeholder.append([count_df.iloc[i]['index'],count_df['False'][i],'No'])
+            # count_df = pd.DataFrame(placeholder, columns = ['category','count','truth_value'])
+            # # st.write("Total Paragraphs: {}".format(len(df)))
+            # fig = px.bar(count_df, x='category', y='count',
+            #              color='truth_value')
+            # # c1, c2 = st.columns([1,1])
+            # # with c1:
+            # st.plotly_chart(fig,use_container_width= True)
+
+            # truth_df['labels'] = truth_df.apply(lambda x: {i if x[i]=='True' else None for i in categories}, axis=1)
+            # truth_df['labels'] = truth_df.apply(lambda x: list(x['labels'] -{None}),axis=1)
+            # # st.write(truth_df)
+            # df = pd.concat([df,truth_df['labels']],axis=1)
+            # df['Validation'] = 'No'
+            # df['Sector1'] = 'Blank'
+            # df['Sector2'] = 'Blank'
+            # df['Sector3'] = 'Blank'
+            # df['Sector4'] = 'Blank'
+            # df['Sector5'] = 'Blank'
+            # df_xlsx = to_excel(df,categories)
+            # st.download_button(label='📥 Download Current Result',
+            #                    data=df_xlsx ,
+            #                    # file_name= 'file_sector.xlsx')
+        # else:
+        #     st.info("🤔 No document found, please try to upload it at the sidebar!")
+        #     logging.warning("Terminated as no document provided")

 # # Creating truth value dataframe
 # if 'key' in st.session_state:
appStore/target.py
CHANGED
@@ -8,11 +8,7 @@ import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import streamlit as st
-
-# from st_aggrid.shared import ColumnsAutoSizeMode
-from utils.target_classifier import target_classification
-from utils.target_classifier import runTargetPreprocessingPipeline, load_targetClassifier
-# from utils.keyword_extraction import textrank
+from utils.target_classifier import load_targetClassifier, target_classification
 import logging
 logger = logging.getLogger(__name__)
 from utils.config import get_classifier_params
@@ -26,8 +22,8 @@ params = get_classifier_params(classifier_identifier)

 ## Labels dictionary ###
 _lab_dict = {
-    '
-    '
+    'NEGATIVE':'NO TARGET INFO',
+    'TARGET':'TARGET',
 }

 @st.cache_data
@@ -48,164 +44,68 @@ def to_excel(df):
 def app():

     #### APP INFO #####
-
-    st.write(
-        """
-        The **Target Extraction** app is an easy-to-use interface built \
-        in Streamlit for analyzing policy documents for \
-        Classification of the paragraphs/texts in the document *If it \
-        contains any Economy-Wide Targets related information* - \
-        developed by GIZ Data Service Center, GFA, IKI Tracs, \
-        SV Klima and SPA. \n
-        """)
-    st.write("""**Document Processing:** The Uploaded/Selected document is \
-        automatically cleaned and split into paragraphs with a maximum \
-        length of 60 words using a Haystack preprocessing pipeline. The \
-        length of 60 is an empirical value which should reflect the length \
-        of a “context” and should limit the paragraph length deviation. \
-        However, since we want to respect the sentence boundary the limit \
-        can breach and hence this limit of 60 is tentative. \n
-        """)
-
-    st.write("")
+    # st.write(
+    #     """
+    #     The **Target Extraction** app is an easy-to-use interface built \
+    #     in Streamlit for analyzing policy documents for \
+    #     Classification of the paragraphs/texts in the document *If it \
+    #     contains any Economy-Wide Targets related information* - \
+    #     developed by GIZ Data Service Center, GFA, IKI Tracs, \
+    #     SV Klima and SPA. \n
+    #     """)

     ### Main app code ###
     with st.container():
-        if st.
-
-            all_documents = runTargetPreprocessingPipeline(file_name= file_name,
-                            file_path= file_path, split_by= params['split_by'],
-                            split_length= params['split_length'],
-                            split_respect_sentence_boundary= params['split_respect_sentence_boundary'],
-                            split_overlap= params['split_overlap'], remove_punc= params['remove_punc'])
-            # st.write(all_documents['documents'])
-
-            #load Classifier
-            classifier = load_targetClassifier(classifier_name=params['model_name'])
-            st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
-            if len(all_documents['documents']) > 100:
-                warning_msg = ": This might take sometime, please sit back and relax."
-            else:
-                warning_msg = ""
-
-            # #st.write(all_documents['documents'],_lab_dict,classifier_identifier,params['threshold'])
-            # with st.spinner("Running Target Related Paragraph Extractions{}".format(warning_msg)):
-
-            df = target_classification(haystack_doc=all_documents['documents'],
-                                       threshold= params['threshold'])
-            st.session_state.key1 = df
-            # temp = df[df['Relevancy']>threshold]
-            hits = df[df['Target Label'] == 'LABEL_1']
-            range_val = min(5,len(hits))
-            if range_val !=0:
-                count_df = df['Target Label'].value_counts()
-                count_df = count_df.rename('count')
-                count_df = count_df.rename_axis('Target Label').reset_index()
-                count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
-
-                fig = px.bar(count_df, y="Label_def", x="count", orientation='h', height=200)
-                c1, c2 = st.columns([1,1])
-                with c1:
-                    st.plotly_chart(fig,use_container_width= True)
-
-                hits = hits.sort_values(by=['Relevancy'], ascending=False)
-                st.write("")
-                st.markdown("###### Top few Economy Wide Target Classified paragraph/text results ######")
-                range_val = min(5,len(hits))
-                for i in range(range_val):
-                    # the page number reflects the page that contains the main paragraph
-                    # according to split limit, the overlapping part can be on a separate page
-                    st.write('**Result {}** `page {}` (Relevancy Score: {:.2f})'.format(i+1,hits.iloc[i]['page'],hits.iloc[i]['Relevancy']))
-                    st.write("\t Text: \t{}".format(hits.iloc[i]['text'].replace("\n", " ")))
-
-            else:
-                st.info("🤔 No Economy Wide Target found")
-            df['Validation'] = 'No'
-            df_xlsx = to_excel(df)
-            st.download_button(label='📥 Download Current Result',
-                               data=df_xlsx ,
-                               file_name= 'file_target.xlsx')
-
-        else:
-            logging.warning("Terminated as no document provided")
-
-    # # Creating truth value dataframe
-    # if 'key1' in st.session_state:
-    #     if st.session_state.key1 is not None:
-    #         df = st.session_state.key1
-    #         st.markdown("###### Select the threshold for classifier ######")
-    #         c1, c2 = st.columns([1,1])
-
-    #         with c1:
-    #             threshold = st.slider("Threshold", min_value=0.00, max_value=1.0,
-    #                                   step=0.01, value=0.5,
-    #                                   help = "Keep High Value if want refined result, low if dont want to miss anything" )
-    #         sectors =set(df.columns)
-    #         removecols = {'Validation','Sectors','text'}
-    #         sectors = list(sectors - removecols)
-
-    #         # creating the dataframe for value counts of Labels, along with 'title' of Labels
-    #         temp = df[df['Relevancy']>threshold]
-    #         count_df = temp['Target Label'].value_counts()
-    #         count_df = count_df.rename('count')
-    #         count_df = count_df.rename_axis('Target Label').reset_index()
-    #         count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
-
-    #         plt.rcParams['font.size'] = 25
-    #         colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(count_df)))
-    #         # plot
-    #         fig, ax = plt.subplots()
-    #         ax.pie(count_df['count'], colors=colors, radius=2, center=(4, 4),
-    #                wedgeprops={"linewidth": 1, "edgecolor": "white"},
-    #                textprops={'fontsize': 14},
-    #                frame=False,labels =list(count_df.Label_def),
-    #                labeldistance=1.2)
-    #         st.markdown("#### Anything related to Targets? ####")
-
-    #         c4, c5, c6 = st.columns([1,2,2])
-
-    #         with c5:
-    #             st.pyplot(fig)
-    #         with c6:
-    #             st.write(count_df[['Label_def','count']])
+        if 'key0' in st.session_state:
+            df = st.session_state.key0
+
+            #load Classifier
+            classifier = load_targetClassifier(classifier_name=params['model_name'])
+            st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
+            if len(df) > 100:
+                warning_msg = ": This might take sometime, please sit back and relax."
+            else:
+                warning_msg = ""
+
+            df = target_classification(haystack_doc=df,
+                                       threshold= params['threshold'])
+            st.session_state.key1 = df
+
+            # # excel part
+            # temp = df[df['Relevancy']>threshold]
+
+            # df['Validation'] = 'No'
+            # df_xlsx = to_excel(df)
+            # st.download_button(label='📥 Download Current Result',
+            #                    data=df_xlsx ,
+            #                    file_name= 'file_target.xlsx')
+
+def target_display():
+    if 'key1' in st.session_state:
+        df = st.session_state.key1
+        hits = df[df['Target Label'] == 'TARGET']
+        range_val = min(5,len(hits))
+        if range_val !=0:
+            count_df = df['Target Label'].value_counts()
+            count_df = count_df.rename('count')
+            count_df = count_df.rename_axis('Target Label').reset_index()
+            count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
+
+            fig = px.bar(count_df, y="Label_def", x="count", orientation='h', height=200)
+            c1, c2 = st.columns([1,1])
+            with c1:
+                st.plotly_chart(fig,use_container_width= True)
+
+            hits = hits.sort_values(by=['Relevancy'], ascending=False)
+            st.write("")
+            st.markdown("###### Top few Economy Wide Target Classified paragraph/text results ######")
+            range_val = min(5,len(hits))
+            for i in range(range_val):
+                # the page number reflects the page that contains the main paragraph
+                # according to split limit, the overlapping part can be on a separate page
+                st.write('**Result {}** `page {}` (Relevancy Score: {:.2f})'.format(i+1,hits.iloc[i]['page'],hits.iloc[i]['Relevancy']))
+                st.write("\t Text: \t{}".format(hits.iloc[i]['text'].replace("\n", " ")))
+
+        else:
+            st.info("🤔 No Targets found")
paramconfig.cfg
CHANGED
@@ -1,6 +1,16 @@
+[preprocessing]
+THRESHOLD = 0.50
+MODEL = garbage
+SPLIT_BY = word
+REMOVE_PUNC = 0
+SPLIT_LENGTH = 60
+SPLIT_OVERLAP = 10
+RESPECT_SENTENCE_BOUNDARY = 1
+TOP_KEY = 10
+
 [target]
 THRESHOLD = 0.50
-MODEL = mtyrrell/
+MODEL = mtyrrell/ikitracs_target_mpnet
 SPLIT_BY = word
 REMOVE_PUNC = 0
 SPLIT_LENGTH = 60
@@ -36,4 +46,14 @@ REMOVE_PUNC = 0
 SPLIT_LENGTH = 60
 SPLIT_OVERLAP = 10
 RESPECT_SENTENCE_BOUNDARY = 1
+TOP_KEY = 10
+
+[ghg]
+THRESHOLD = 0.50
+MODEL = mtyrrell/ikitracs_transport_ghg
+SPLIT_BY = word
+REMOVE_PUNC = 0
+SPLIT_LENGTH = 60
+SPLIT_OVERLAP = 10
+RESPECT_SENTENCE_BOUNDARY = 1
 TOP_KEY = 10
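Each section of paramconfig.cfg is read through get_classifier_params from utils.config, which is imported throughout the appStore modules but is not part of this changeset. A minimal sketch of what that helper presumably does, inferred from the config keys above and the lower-case parameter names the apps use (params['model_name'], params['split_by'], and so on); the key mapping, the 'top_n' name, and the use of configparser are assumptions, not confirmed by this diff:

import configparser

def get_classifier_params(section: str, config_file: str = "paramconfig.cfg") -> dict:
    # Hypothetical sketch: read one section of paramconfig.cfg and map its
    # upper-case keys to the lower-case keys the Streamlit apps expect.
    config = configparser.ConfigParser()
    config.read(config_file)
    return {
        'model_name': config.get(section, 'MODEL'),
        'split_by': config.get(section, 'SPLIT_BY'),
        'split_length': int(config.get(section, 'SPLIT_LENGTH')),
        'split_overlap': int(config.get(section, 'SPLIT_OVERLAP')),
        'remove_punc': bool(int(config.get(section, 'REMOVE_PUNC'))),
        'split_respect_sentence_boundary': bool(int(config.get(section, 'RESPECT_SENTENCE_BOUNDARY'))),
        'threshold': float(config.get(section, 'THRESHOLD')),
        'top_n': int(config.get(section, 'TOP_KEY')),   # name guessed from TOP_KEY
    }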
utils/adapmit_classifier.py
CHANGED
@@ -34,10 +34,6 @@ def load_adapmitClassifier(config_file:str = None, classifier_name:str = None):
         classifier_name = config.get('adapmit','MODEL')

     logging.info("Loading Adaptation Mitigation classifier")
-    # doc_classifier = TransformersDocumentClassifier(
-    #     model_name_or_path=classifier_name,
-    #     task="text-classification",
-    #     top_k = None)
     doc_classifier = pipeline("text-classification",
                               model=classifier_name,
                               return_all_scores=True,
@@ -47,51 +43,8 @@ def load_adapmitClassifier(config_file:str = None, classifier_name:str = None):
     return doc_classifier


-def runAdapMitPreprocessingPipeline(file_name:str, file_path:str,
-        split_by: Literal["sentence", "word"] = 'sentence',
-        split_length:int = 2, split_respect_sentence_boundary:bool = False,
-        split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
-    """
-    creates the pipeline and runs the preprocessing pipeline,
-    the params for pipeline are fetched from paramconfig
-    Params
-    ------------
-    file_name: filename, in case of streamlit application use
-        st.session_state['filename']
-    file_path: filepath, in case of streamlit application use st.session_state['filepath']
-    split_by: document splitting strategy either as word or sentence
-    split_length: when synthetically creating the paragrpahs from document,
-        it defines the length of paragraph.
-    split_respect_sentence_boundary: Used when using 'word' strategy for
-        splititng of text.
-    split_overlap: Number of words or sentences that overlap when creating
-        the paragraphs. This is done as one sentence or 'some words' make sense
-        when read in together with others. Therefore the overlap is used.
-    remove_punc: to remove all Punctuation including ',' and '.' or not
-    Return
-    --------------
-    List[Document]: When preprocessing pipeline is run, the output dictionary
-        has four objects. For the Haysatck implementation of SDG classification we,
-        need to use the List of Haystack Document, which can be fetched by
-        key = 'documents' on output.
-    """
-
-    adapmit_processing_pipeline = processingpipeline()
-
-    output_adapmit_pre = adapmit_processing_pipeline.run(file_paths = file_path,
-                            params= {"FileConverter": {"file_path": file_path, \
-                                        "file_name": file_name},
-                                     "UdfPreProcessor": {"remove_punc": remove_punc, \
-                                        "split_by": split_by, \
-                                        "split_length":split_length,\
-                                        "split_overlap": split_overlap, \
-                                        "split_respect_sentence_boundary":split_respect_sentence_boundary}})
-
-    return output_adapmit_pre
-
-
 @st.cache_data
-def adapmit_classification(haystack_doc:
+def adapmit_classification(haystack_doc:pd.DataFrame,
                            threshold:float = 0.5,
                            classifier_model:pipeline= None
                            )->Tuple[DataFrame,Series]:
@@ -115,10 +68,14 @@ def adapmit_classification(haystack_doc:List[Document],
         the number of times it is covered/discussed/count_of_paragraphs.
     """
     logging.info("Working on Adaptation-Mitigation Identification")
+    haystack_doc['Adapt-Mitig Label'] = 'NA'
+    df1 = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
+    df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
+
     if not classifier_model:
         classifier_model = st.session_state['adapmit_classifier']

-    predictions = classifier_model(
+    predictions = classifier_model(list(df1.text))
     # converting the predictions to desired format
     list_ = []
     for i in range(len(predictions)):
@@ -128,9 +85,17 @@ def adapmit_classification(haystack_doc:List[Document],
         for j in range(len(temp)):
             placeholder[temp[j]['label']] = temp[j]['score']
         list_.append(placeholder)
-    labels_ = [{**
-
+    labels_ = [{**list_[l]} for l in range(len(predictions))]

     return df
labels_ = [{**list_[l]} for l in range(len(predictions))]
|
89 |
+
truth_df = DataFrame.from_dict(labels_)
|
90 |
+
truth_df = truth_df.round(2)
|
91 |
+
truth_df = truth_df.astype(float) >= threshold
|
92 |
+
truth_df = truth_df.astype(str)
|
93 |
+
categories = list(truth_df.columns)
|
94 |
+
truth_df['Adapt-Mitig Label'] = truth_df.apply(lambda x: {i if x[i]=='True'
|
95 |
+
else None for i in categories}, axis=1)
|
96 |
+
truth_df['Adapt-Mitig Label'] = truth_df.apply(lambda x:
|
97 |
+
list(x['Adapt-Mitig Label'] -{None}),axis=1)
|
98 |
+
df1['Adapt-Mitig Label'] = list(truth_df['Adapt-Mitig Label'])
|
99 |
+
df = pd.concat([df,df1])
|
100 |
|
101 |
return df
|
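The block above is the core of the new multi-label post-processing: the pipeline is loaded with return_all_scores=True, so every paragraph gets a score per class, and any class at or above the threshold survives into the 'Adapt-Mitig Label' list. A standalone sketch with toy scores (the class names are illustrative, not the model's actual labels), using a plain list comprehension in place of the set-minus-{None} trick:

    from pandas import DataFrame

    threshold = 0.5
    # one label->score dict per paragraph, as built by the loop over `predictions`
    list_ = [{'Adaptation': 0.81, 'Mitigation': 0.12},
             {'Adaptation': 0.55, 'Mitigation': 0.67}]

    truth_df = DataFrame(list_).round(2)
    truth_df = truth_df >= threshold                 # boolean mask per class
    categories = list(truth_df.columns)
    truth_df['Adapt-Mitig Label'] = truth_df.apply(
        lambda x: [i for i in categories if x[i]], axis=1)
    print(truth_df['Adapt-Mitig Label'].tolist())
    # [['Adaptation'], ['Adaptation', 'Mitigation']]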
utils/ghg_classifier.py
ADDED
@@ -0,0 +1,90 @@
+from haystack.nodes import TransformersDocumentClassifier
+from haystack.schema import Document
+from typing import List, Tuple
+from typing_extensions import Literal
+import logging
+import pandas as pd
+from pandas import DataFrame, Series
+from utils.config import getconfig
+from utils.preprocessing import processingpipeline
+import streamlit as st
+from transformers import pipeline
+
+# Labels dictionary ###
+_lab_dict = {
+            'NEGATIVE':'NO GHG TARGET',
+            'TARGET':'GHG TARGET',
+            }
+
+@st.cache_resource
+def load_ghgClassifier(config_file:str = None, classifier_name:str = None):
+    """
+    loads the text-classification model; the name/path of the model on the
+    HF hub is used to fetch the model object. Either config file or
+    model name should be passed.
+    1. https://docs.haystack.deepset.ai/reference/document-classifier-api
+    2. https://docs.haystack.deepset.ai/docs/document_classifier
+    Params
+    --------
+    config_file: config file path from which to read the model name
+    classifier_name: if model name is passed, it takes priority; if not
+                    found then will look for config file, else raise error.
+    Return: document classifier model
+    """
+    if not classifier_name:
+        if not config_file:
+            logging.warning("Pass either model name or config file")
+            return
+        else:
+            config = getconfig(config_file)
+            classifier_name = config.get('ghg','MODEL')
+
+    logging.info("Loading ghg classifier")
+    doc_classifier = pipeline("text-classification",
+                            model=classifier_name,
+                            top_k =1)
+
+    return doc_classifier
+
+
+@st.cache_data
+def ghg_classification(haystack_doc:pd.DataFrame,
+                        threshold:float = 0.5,
+                        classifier_model:pipeline= None
+                        )->Tuple[DataFrame,Series]:
+    """
+    Text-Classification on the list of texts provided. The classifier assigns
+    the most appropriate label to each text; the labels indicate whether a
+    paragraph contains a GHG (greenhouse gas) target or not.
+    Params
+    ---------
+    haystack_doc: Dataframe of paragraphs (output of the preprocessing and
+                target steps) with 'text' and 'Target Label' columns.
+    threshold: threshold value for the model to keep the results from classifier
+    classifier_model: you can pass the classifier model directly, which takes priority;
+                however if not then looks for the model in the streamlit session.
+                In case of streamlit avoid passing the model directly.
+    Returns
+    ----------
+    df: Dataframe with the input paragraphs plus 'GHG Label' and 'GHG Score' columns.
+    """
+    logging.info("Working on GHG Extraction")
+    haystack_doc['GHG Label'] = 'NA'
+    haystack_doc['GHG Score'] = 'NA'
+    temp = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
+    df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
+
+    if not classifier_model:
+        classifier_model = st.session_state['ghg_classifier']
+
+    results = classifier_model(list(temp.text))
+    labels_= [(l[0]['label'],l[0]['score']) for l in results]
+    temp['GHG Label'],temp['GHG Score'] = zip(*labels_)
+    df = pd.concat([df,temp])
+    df = df.reset_index(drop =True)
+    df.index += 1
+
+    return df
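A hedged usage sketch for the new module (toy DataFrame; in the app the paragraph frame is produced by the document-processing and target steps and the model is cached in st.session_state['ghg_classifier']):

    import pandas as pd
    from utils.ghg_classifier import load_ghgClassifier, ghg_classification

    paras = pd.DataFrame({
        'text': ["Reduce GHG emissions by 40% below 2010 levels by 2030.",
                 "The country has a long coastline."],
        'Target Label': ['TARGET', 'NEGATIVE'],
    })

    clf = load_ghgClassifier(config_file='paramconfig.cfg')   # reads MODEL from the new [ghg] section
    out = ghg_classification(paras, classifier_model=clf)
    print(out[['text', 'Target Label', 'GHG Label', 'GHG Score']])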
utils/netzero_classifier.py
CHANGED
@@ -8,6 +8,7 @@ from pandas import DataFrame, Series
 from utils.config import getconfig
 from utils.preprocessing import processingpipeline
 import streamlit as st
+from transformers import pipeline
 
 # Labels dictionary ###
 _lab_dict = {
@@ -39,60 +40,17 @@ def load_netzeroClassifier(config_file:str = None, classifier_name:str = None):
             classifier_name = config.get('netzero','MODEL')
 
     logging.info("Loading netzero classifier")
-    doc_classifier =
-
-
+    doc_classifier = pipeline("text-classification",
+                            model=classifier_name,
+                            top_k =1)
 
     return doc_classifier
 
 
-def runNetZeroPreprocessingPipeline(file_name:str, file_path:str,
-                        split_by: Literal["sentence", "word"] = 'sentence',
-                        split_length:int = 2, split_respect_sentence_boundary:bool = False,
-                        split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
-    """
-    creates the pipeline and runs the preprocessing pipeline,
-    the params for pipeline are fetched from paramconfig
-    Params
-    ------------
-    file_name: filename, in case of streamlit application use
-                st.session_state['filename']
-    file_path: filepath, in case of streamlit application use st.session_state['filepath']
-    split_by: document splitting strategy either as word or sentence
-    split_length: when synthetically creating the paragrpahs from document,
-                it defines the length of paragraph.
-    split_respect_sentence_boundary: Used when using 'word' strategy for
-                splititng of text.
-    split_overlap: Number of words or sentences that overlap when creating
-                the paragraphs. This is done as one sentence or 'some words' make sense
-                when read in together with others. Therefore the overlap is used.
-    remove_punc: to remove all Punctuation including ',' and '.' or not
-    Return
-    --------------
-    List[Document]: When preprocessing pipeline is run, the output dictionary
-                has four objects. For the Haysatck implementation of SDG classification we,
-                need to use the List of Haystack Document, which can be fetched by
-                key = 'documents' on output.
-    """
-
-    netzero_processing_pipeline = processingpipeline()
-
-    output_netzero_pre = netzero_processing_pipeline.run(file_paths = file_path,
-                            params= {"FileConverter": {"file_path": file_path, \
-                                        "file_name": file_name},
-                                    "UdfPreProcessor": {"remove_punc": remove_punc, \
-                                            "split_by": split_by, \
-                                            "split_length":split_length,\
-                                            "split_overlap": split_overlap, \
-                                            "split_respect_sentence_boundary":split_respect_sentence_boundary}})
-
-    return output_netzero_pre
-
-
 @st.cache_data
-def netzero_classification(haystack_doc:
+def netzero_classification(haystack_doc:pd.DataFrame,
                         threshold:float = 0.8,
-                        classifier_model:
+                        classifier_model:pipeline= None
                         )->Tuple[DataFrame,Series]:
     """
     Text-Classification on the list of texts provided. Classifier provides the
@@ -114,24 +72,19 @@ def netzero_classification(haystack_doc:List[Document],
         the number of times it is covered/discussed/count_of_paragraphs.
     """
     logging.info("Working on Netzero Extraction")
+    haystack_doc['Netzero Label'] = 'NA'
+    haystack_doc['Netzero Score'] = 'NA'
+    temp = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
+    df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
+
     if not classifier_model:
         classifier_model = st.session_state['netzero_classifier']
 
-    results = classifier_model.
-    labels_= [(l
-
-
-    df =
-
-    df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
+    results = classifier_model(list(temp.text))
+    labels_= [(l[0]['label'],l[0]['score']) for l in results]
+    temp['Netzero Label'],temp['Netzero Score'] = zip(*labels_)
+    df = pd.concat([df,temp])
+    df = df.reset_index(drop =True)
     df.index += 1
-    # df =df[df['Relevancy']>threshold]
-    df['Label_def'] = df['Target Label'].apply(lambda i: _lab_dict[i])
-
-    # creating the dataframe for value counts of Labels, along with 'title' of Labels
-    # count_df = df['Target Label'].value_counts()
-    # count_df = count_df.rename('count')
-    # count_df = count_df.rename_axis('Target Label').reset_index()
-    # count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
 
     return df
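Both the netzero and ghg classifiers now rely on the same output shape: with top_k=1 the transformers text-classification pipeline returns one single-element list of {'label', 'score'} per input text, which the l[0] indexing and zip(*labels_) turn into a label column and a score column. A small sketch with made-up values:

    results = [[{'label': 'TARGET', 'score': 0.93}],
               [{'label': 'NEGATIVE', 'score': 0.88}]]

    labels_ = [(l[0]['label'], l[0]['score']) for l in results]
    netzero_label, netzero_score = zip(*labels_)
    print(netzero_label)   # ('TARGET', 'NEGATIVE')
    print(netzero_score)   # (0.93, 0.88)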
utils/preprocessing.py
CHANGED
@@ -150,20 +150,36 @@ def basic(s:str, remove_punc:bool = False):
 
     return s.strip()
 
-def paraLengthCheck(paraList, max_len =
+def paraLengthCheck(paraList, max_len = 100):
+    """
+    There are cases where preprocessor cannot respect word limit, when using
+    respect sentence boundary flag due to missing sentence boundaries.
+    Therefore we run one more round of split here for those paragraphs
+
+    Params
+    ---------------
+    paraList : list of paragraphs/text
+    max_len : max length to be respected by sentences which bypassed
+                preprocessor strategy
+
+    """
     new_para_list = []
     for passage in paraList:
-        if
-
-
+        # check if para exceeds words limit
+        if len(passage.content.split()) > max_len:
+            # we might need few iterations example if para = 512 tokens
+            # we need to iterate 5 times to reduce para to size limit of '100'
+            iterations = int(len(passage.content.split())/max_len)
             for i in range(iterations):
-                temp = " ".join(passage.split()[max_len*i:max_len*(i+1)])
-                new_para_list.append(temp)
-                temp = " ".join(passage.split()[max_len*(i+1):])
-                new_para_list.append(temp)
+                temp = " ".join(passage.content.split()[max_len*i:max_len*(i+1)])
+                new_para_list.append((temp,passage.meta['page']))
+                temp = " ".join(passage.content.split()[max_len*(i+1):])
+                new_para_list.append((temp,passage.meta['page']))
         else:
-
-
+            # paragraphs which dont need any splitting
+            new_para_list.append((passage.content, passage.meta['page']))
+
+    logging.info("New paragraphs length {}".format(len(new_para_list)))
     return new_para_list
 
 class UdfPreProcessor(BaseComponent):
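Because paraLengthCheck now works on haystack Documents and returns (text, page) tuples, every re-split chunk keeps its source page number. A small sketch of the behaviour, assuming the function is importable as utils.preprocessing.paraLengthCheck as in this commit:

    from haystack.schema import Document
    from utils.preprocessing import paraLengthCheck

    long_text = " ".join(["word"] * 150)     # 150 words, above the default max_len of 100
    docs = [Document(content=long_text, meta={'page': 3}),
            Document(content="a short paragraph", meta={'page': 4})]

    for text, page in paraLengthCheck(docs, max_len=100):
        print(page, len(text.split()))
    # 3 100
    # 3 50
    # 4 3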
utils/sector_classifier.py
CHANGED
@@ -11,12 +11,6 @@ from haystack.nodes import TransformersDocumentClassifier
 from transformers import pipeline
 
 
-# # Labels dictionary ###
-# _lab_dict = {
-#             'NEGATIVE':'NO NETZERO TARGET',
-#             'NETZERO':'NETZERO TARGET',
-#             }
-
 @st.cache_resource
 def load_sectorClassifier(config_file:str = None, classifier_name:str = None):
     """
@@ -58,53 +52,10 @@ def load_sectorClassifier(config_file:str = None, classifier_name:str = None):
     return doc_classifier
 
 
-def runSectorPreprocessingPipeline(file_name:str, file_path:str,
-                        split_by: Literal["sentence", "word"] = 'sentence',
-                        split_length:int = 2, split_respect_sentence_boundary:bool = False,
-                        split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
-    """
-    creates the pipeline and runs the preprocessing pipeline,
-    the params for pipeline are fetched from paramconfig
-    Params
-    ------------
-    file_name: filename, in case of streamlit application use
-                st.session_state['filename']
-    file_path: filepath, in case of streamlit application use st.session_state['filepath']
-    split_by: document splitting strategy either as word or sentence
-    split_length: when synthetically creating the paragrpahs from document,
-                it defines the length of paragraph.
-    split_respect_sentence_boundary: Used when using 'word' strategy for
-                splititng of text.
-    split_overlap: Number of words or sentences that overlap when creating
-                the paragraphs. This is done as one sentence or 'some words' make sense
-                when read in together with others. Therefore the overlap is used.
-    remove_punc: to remove all Punctuation including ',' and '.' or not
-    Return
-    --------------
-    List[Document]: When preprocessing pipeline is run, the output dictionary
-                has four objects. For the Haysatck implementation of SDG classification we,
-                need to use the List of Haystack Document, which can be fetched by
-                key = 'documents' on output.
-    """
-
-    sector_processing_pipeline = processingpipeline()
-
-    output_sector_pre = sector_processing_pipeline.run(file_paths = file_path,
-                            params= {"FileConverter": {"file_path": file_path, \
-                                        "file_name": file_name},
-                                    "UdfPreProcessor": {"remove_punc": remove_punc, \
-                                            "split_by": split_by, \
-                                            "split_length":split_length,\
-                                            "split_overlap": split_overlap, \
-                                            "split_respect_sentence_boundary":split_respect_sentence_boundary}})
-
-    return output_sector_pre
-
-
 @st.cache_data
-def sector_classification(haystack_doc:
-                        threshold:float = 0.
-                        classifier_model:
+def sector_classification(haystack_doc:pd.DataFrame,
+                        threshold:float = 0.5,
+                        classifier_model:pipeline= None
                         )->Tuple[DataFrame,Series]:
     """
     Text-Classification on the list of texts provided. Classifier provides the
@@ -126,10 +77,14 @@ def sector_classification(haystack_doc:List[Document],
         the number of times it is covered/discussed/count_of_paragraphs.
     """
     logging.info("Working on Sector Identification")
+    haystack_doc['Sector Label'] = 'NA'
+    df1 = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
+    df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
     if not classifier_model:
         classifier_model = st.session_state['sector_classifier']
 
-    predictions = classifier_model(
+    predictions = classifier_model(list(df1.text))
+
     list_ = []
     for i in range(len(predictions)):
 
@@ -138,9 +93,16 @@ def sector_classification(haystack_doc:List[Document],
         for j in range(len(temp)):
             placeholder[temp[j]['label']] = temp[j]['score']
         list_.append(placeholder)
-    labels_ = [{**
-
-
-
-
+    labels_ = [{**list_[l]} for l in range(len(predictions))]
+    truth_df = DataFrame.from_dict(labels_)
+    truth_df = truth_df.round(2)
+    truth_df = truth_df.astype(float) >= threshold
+    truth_df = truth_df.astype(str)
+    categories = list(truth_df.columns)
+    truth_df['Sector Label'] = truth_df.apply(lambda x: {i if x[i]=='True' else
+                                None for i in categories}, axis=1)
+    truth_df['Sector Label'] = truth_df.apply(lambda x: list(x['Sector Label']
+                                -{None}),axis=1)
+    df1['Sector Label'] = list(truth_df['Sector Label'])
+    df = pd.concat([df,df1])
     return df
utils/target_classifier.py
CHANGED
@@ -8,11 +8,12 @@ from pandas import DataFrame, Series
 from utils.config import getconfig
 from utils.preprocessing import processingpipeline
 import streamlit as st
+from transformers import pipeline
 
 ## Labels dictionary ###
 _lab_dict = {
-            '
-            '
+            'NEGATIVE':'NO TARGET INFO',
+            'TARGET':'TARGET',
             }
 
 @st.cache_resource
@@ -38,61 +39,19 @@ def load_targetClassifier(config_file:str = None, classifier_name:str = None):
             config = getconfig(config_file)
             classifier_name = config.get('target','MODEL')
 
-    logging.info("Loading classifier")
-
-
-
+    logging.info("Loading classifier")
+
+    doc_classifier = pipeline("text-classification",
+                            model=classifier_name,
+                            top_k =1)
 
     return doc_classifier
 
 
-def runTargetPreprocessingPipeline(file_name:str, file_path:str,
-                        split_by: Literal["sentence", "word"] = 'sentence',
-                        split_length:int = 2, split_respect_sentence_boundary:bool = False,
-                        split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
-    """
-    creates the pipeline and runs the preprocessing pipeline,
-    the params for pipeline are fetched from paramconfig
-    Params
-    ------------
-    file_name: filename, in case of streamlit application use
-                st.session_state['filename']
-    file_path: filepath, in case of streamlit application use st.session_state['filepath']
-    split_by: document splitting strategy either as word or sentence
-    split_length: when synthetically creating the paragrpahs from document,
-                it defines the length of paragraph.
-    split_respect_sentence_boundary: Used when using 'word' strategy for
-                splititng of text.
-    split_overlap: Number of words or sentences that overlap when creating
-                the paragraphs. This is done as one sentence or 'some words' make sense
-                when read in together with others. Therefore the overlap is used.
-    remove_punc: to remove all Punctuation including ',' and '.' or not
-    Return
-    --------------
-    List[Document]: When preprocessing pipeline is run, the output dictionary
-                has four objects. For the Haysatck implementation of SDG classification we,
-                need to use the List of Haystack Document, which can be fetched by
-                key = 'documents' on output.
-    """
-
-    target_processing_pipeline = processingpipeline()
-
-    output_target_pre = target_processing_pipeline.run(file_paths = file_path,
-                            params= {"FileConverter": {"file_path": file_path, \
-                                        "file_name": file_name},
-                                    "UdfPreProcessor": {"remove_punc": remove_punc, \
-                                            "split_by": split_by, \
-                                            "split_length":split_length,\
-                                            "split_overlap": split_overlap, \
-                                            "split_respect_sentence_boundary":split_respect_sentence_boundary}})
-
-    return output_target_pre
-
-
 @st.cache_data
-def target_classification(haystack_doc:
-                        threshold:float = 0.
-                        classifier_model:
+def target_classification(haystack_doc:pd.DataFrame,
+                        threshold:float = 0.5,
+                        classifier_model:pipeline= None
                         )->Tuple[DataFrame,Series]:
     """
     Text-Classification on the list of texts provided. Classifier provides the
@@ -117,22 +76,16 @@ def target_classification(haystack_doc:List[Document],
     if not classifier_model:
         classifier_model = st.session_state['target_classifier']
 
-    results = classifier_model
-    labels_= [(l
-                l
+    results = classifier_model(list(haystack_doc.text))
+    labels_= [(l[0]['label'],
+               l[0]['score']) for l in results]
 
 
-
+    df1 = DataFrame(labels_, columns=["Target Label","Relevancy"])
+    df = pd.concat([haystack_doc,df1],axis=1)
 
     df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
     df.index += 1
-    # df =df[df['Relevancy']>threshold]
     df['Label_def'] = df['Target Label'].apply(lambda i: _lab_dict[i])
 
-    # creating the dataframe for value counts of Labels, along with 'title' of Labels
-    # count_df = df['Target Label'].value_counts()
-    # count_df = count_df.rename('count')
-    # count_df = count_df.rename_axis('Target Label').reset_index()
-    # count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
-
     return df
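Note the ordering this commit creates: target_classification adds the 'Target Label' and 'Relevancy' columns that the netzero, ghg, sector, and adapmit classifiers filter on, so it has to run before them. A hedged end-to-end sketch on a toy frame (outside the app, the cached session-state models are replaced by explicitly loaded ones):

    import pandas as pd
    from utils.target_classifier import load_targetClassifier, target_classification
    from utils.netzero_classifier import load_netzeroClassifier, netzero_classification

    paras = pd.DataFrame({'text': ["Achieve net zero emissions by 2050.",
                                   "Chapter 2 describes national circumstances."]})

    df = target_classification(paras,
                               classifier_model=load_targetClassifier(config_file='paramconfig.cfg'))
    df = netzero_classification(df,
                                classifier_model=load_netzeroClassifier(config_file='paramconfig.cfg'))
    print(df[['text', 'Target Label', 'Relevancy', 'Netzero Label']])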
utils/uploadAndExample.py
CHANGED
@@ -11,6 +11,12 @@ def add_upload(choice):
     """
 
     if choice == 'Upload Document':
+
+        if 'filename' in st.session_state:
+            # Delete all the items in Session state
+            for key in st.session_state.keys():
+                del st.session_state[key]
+
         uploaded_file = st.sidebar.file_uploader('Upload the File',
                         type=['pdf', 'docx', 'txt'])
         if uploaded_file is not None:
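The reset added here clears the whole session state whenever a fresh document is uploaded, so paragraphs and classification results cached from the previous file are not shown for the new one. A minimal sketch of the same idea for use inside a Streamlit script (the key names are illustrative; iterating over a list() copy of the keys is a slightly more defensive variant of the loop in the diff):

    import streamlit as st

    # pretend a previous document left state behind
    st.session_state['filename'] = 'old_doc.pdf'
    st.session_state['cached_results'] = 'results from the previous run'

    if 'filename' in st.session_state:
        for key in list(st.session_state.keys()):   # copy the keys before deleting
            del st.session_state[key]

    assert len(st.session_state) == 0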