Updating

Browse files
- app.py +33 -9
- appStore/adapmit.py +76 -114
- appStore/doc_processing.py +77 -0
- appStore/ghg.py +95 -0
- appStore/info.py +0 -67
- appStore/multiapp.py +0 -67
- appStore/netzero.py +34 -145
- appStore/sector.py +53 -96
- appStore/target.py +62 -162
- paramconfig.cfg +21 -1
- utils/adapmit_classifier.py +18 -53
- utils/ghg_classifier.py +90 -0
- utils/netzero_classifier.py +16 -63
- utils/preprocessing.py +26 -10
- utils/sector_classifier.py +20 -58
- utils/target_classifier.py +16 -63
- utils/uploadAndExample.py +6 -0
app.py
CHANGED
@@ -2,19 +2,43 @@ import appStore.target as target_extraction
 import appStore.netzero as netzero
 import appStore.sector as sector
 import appStore.adapmit as adapmit
-
-
+import appStore.ghg as ghg
+import appStore.doc_processing as processing
+from utils.uploadAndExample import add_upload
 import streamlit as st

 st.set_page_config(page_title = 'Climate Policy Intelligence',
                    initial_sidebar_state='expanded', layout="wide")

-
-
-
-    app.add_app("Sector Classification","gear", sector.app)
-    app.add_app("Adaptation-Mitigation","gear", adapmit.app)
-
+with st.sidebar:
+    # upload and example doc
+    choice = st.sidebar.radio(label = 'Select the Document',
+                              help = 'You can upload the document \
+                              or else you can try a example document',
+                              options = ('Upload Document', 'Try Example'),
+                              horizontal = True)
+    add_upload(choice)
+
+with st.container():
+    st.markdown("<h2 style='text-align: center; color: black;'> Climate Policy Intelligence App </h2>", unsafe_allow_html=True)
+    st.write(' ')
+
+# with st.expander("ℹ️ - About this app", expanded=False):
+#     st.write(
+#         """
+#         Climate Policy Understanding App is an open-source \
+#         digital tool which aims to assist policy analysts and \
+#         other users in extracting and filtering relevant \
+#         information from public documents.
+#         """)
+#     st.write("")
+
+apps = [processing.app, target_extraction.app, netzero.app, ghg.app,
+        sector.app, adapmit.app]
+multiplier_val = int(100/len(apps))
+if st.button("Get the work done"):
+    prg = st.progress(0)
+    for i, func in enumerate(apps):
+        func()
+        prg.progress((i+1)*multiplier_val)
+    if 'key1' in st.session_state:
+        st.write(st.session_state.key1)
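The rewritten app.py drops the MultiApp selector and simply calls each sub-app's app() in sequence; the modules hand data to each other only through st.session_state (the processing step writes the paragraph dataframe to 'key0', the classifiers read and update 'key1'). A minimal sketch of that hand-off pattern, using two hypothetical step functions rather than the real modules:

import streamlit as st

# Hypothetical two-step illustration of the session_state hand-off used above.
def step_process():
    # would normally parse the uploaded document into a paragraph dataframe
    st.session_state['key0'] = ["paragraph 1", "paragraph 2"]

def step_classify():
    paragraphs = st.session_state['key0']            # read what the previous step produced
    st.session_state['key1'] = [(p, 'TARGET') for p in paragraphs]

apps = [step_process, step_classify]
if st.button("Get the work done"):
    prg = st.progress(0)
    for i, func in enumerate(apps):
        func()
        prg.progress(int((i + 1) * 100 / len(apps)))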
appStore/adapmit.py
CHANGED
@@ -8,10 +8,7 @@ import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import streamlit as st
-
-# from st_aggrid.shared import ColumnsAutoSizeMode
-from utils.adapmit_classifier import adapmit_classification
-from utils.adapmit_classifier import runAdapMitPreprocessingPipeline, load_adapmitClassifier
+from utils.adapmit_classifier import load_adapmitClassifier, adapmit_classification
 # from utils.keyword_extraction import textrank
 import logging
 logger = logging.getLogger(__name__)
@@ -48,122 +45,87 @@ def to_excel(df):

 def app():

-    #### APP INFO #####
-    with st.container():
-        st.markdown("<h1 style='text-align: center; color: black;'> Adaptation-Mitigation Classification </h1>", unsafe_allow_html=True)
-        st.write(' ')
-        st.write(' ')
-
-    with st.expander("ℹ️ - About this app", expanded=False):
-
-        st.write(
-            """
-            The **Adaptation-Mitigation Classification** app is an easy-to-use interface built \
-            in Streamlit for analyzing policy documents for \
-            Classification of the paragraphs/texts in the document *If it \
-            belongs to 'Adaptation' and 'Mitigation' category or not. The paragraph \
-            can belong to both category too. \
-            - developed by GIZ Data Service Center, GFA, IKI Tracs, \
-            SV Klima and SPA. \n
-            """)
-        st.write("""**Document Processing:** The Uploaded/Selected document is \
-            automatically cleaned and split into paragraphs with a maximum \
-            length of 60 words using a Haystack preprocessing pipeline. The \
-            length of 60 is an empirical value which should reflect the length \
-            of a “context” and should limit the paragraph length deviation. \
-            However, since we want to respect the sentence boundary the limit \
-            can breach and hence this limit of 60 is tentative. \n
-            """)
-
-        st.write("")
-
     ### Main app code ###
     with st.container():
-        if st.button("RUN Adaptation-Mitigation Classification"):
-            if 'key4' not in st.session_state:
-                st.session_state['key4'] = None
-
-            #
-            # st.write("Total Paragraphs: {}".format(len(df)))
-            fig = px.bar(count_df, y='category', x='count',
-                         color='truth_value',orientation='h', height =200)
-            c1, c2 = st.columns([1,1])
-            with c1:
-                st.plotly_chart(fig,use_container_width= True)
-
-            truth_df['labels'] = truth_df.apply(lambda x: {i if x[i]=='True' else None for i in categories}, axis=1)
-            truth_df['labels'] = truth_df.apply(lambda x: list(x['labels'] -{None}),axis=1)
-            # st.write(truth_df)
-            df = pd.concat([df,truth_df['labels']],axis=1)
-            st.markdown("###### Top few 'Mitigation' related paragraph/text ######")
-            df = df.sort_values(by = ['Mitigation'], ascending=False)
-            for i in range(3):
-                if df.iloc[i]['Mitigation'] >= 0.50:
-                    st.write('**Result {}** (Relevancy Score: {:.2f})'.format(i+1,df.iloc[i]['Mitigation']))
-                    st.write("\t Text: \t{}".format(df.iloc[i]['text'].replace("\n", " ")))
-        else:
+        if 'key1' in st.session_state:
+            df = st.session_state.key1
+
+            classifier = load_adapmitClassifier(classifier_name=params['model_name'])
+            st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
+            if sum(df['Target Label'] == 'TARGET') > 100:
+                warning_msg = ": This might take sometime, please sit back and relax."
+            else:
+                warning_msg = ""
+
+            df = adapmit_classification(haystack_doc=df,
+                                        threshold= params['threshold'])
+
+            st.session_state.key1 = df
+
+            # threshold= params['threshold']
+            # truth_df = df.drop(['text'],axis=1)
+            # truth_df = truth_df.astype(float) >= threshold
+            # truth_df = truth_df.astype(str)
+            # categories = list(truth_df.columns)
+
+            # placeholder = {}
+            # for val in categories:
+            #     placeholder[val] = dict(truth_df[val].value_counts())
+            # count_df = pd.DataFrame.from_dict(placeholder)
+            # count_df = count_df.T
+            # count_df = count_df.reset_index()
+            # # st.write(count_df)
+            # placeholder = []
+            # for i in range(len(count_df)):
+            #     placeholder.append([count_df.iloc[i]['index'],count_df['True'][i],'Yes'])
+            #     placeholder.append([count_df.iloc[i]['index'],count_df['False'][i],'No'])
+            # count_df = pd.DataFrame(placeholder, columns = ['category','count','truth_value'])
+            # # st.write("Total Paragraphs: {}".format(len(df)))
+            # fig = px.bar(count_df, y='category', x='count',
+            #              color='truth_value',orientation='h', height =200)
+            # c1, c2 = st.columns([1,1])
+            # with c1:
+            #     st.plotly_chart(fig,use_container_width= True)
+
+            # truth_df['labels'] = truth_df.apply(lambda x: {i if x[i]=='True' else None for i in categories}, axis=1)
+            # truth_df['labels'] = truth_df.apply(lambda x: list(x['labels'] -{None}),axis=1)
+            # # st.write(truth_df)
+            # df = pd.concat([df,truth_df['labels']],axis=1)
+            # st.markdown("###### Top few 'Mitigation' related paragraph/text ######")
+            # df = df.sort_values(by = ['Mitigation'], ascending=False)
+            # for i in range(3):
+            #     if df.iloc[i]['Mitigation'] >= 0.50:
+            #         st.write('**Result {}** (Relevancy Score: {:.2f})'.format(i+1,df.iloc[i]['Mitigation']))
+            #         st.write("\t Text: \t{}".format(df.iloc[i]['text'].replace("\n", " ")))

+            # st.markdown("###### Top few 'Adaptation' related paragraph/text ######")
+            # df = df.sort_values(by = ['Adaptation'], ascending=False)
+            # for i in range(3):
+            #     if df.iloc[i]['Adaptation'] > 0.5:
+            #         st.write('**Result {}** (Relevancy Score: {:.2f})'.format(i+1,df.iloc[i]['Adaptation']))
+            #         st.write("\t Text: \t{}".format(df.iloc[i]['text'].replace("\n", " ")))
+            # # st.write(df[['text','labels']])
+            # df['Validation'] = 'No'
+            # df['Val-Mitigation'] = 'No'
+            # df['Val-Adaptation'] = 'No'
+            # df_xlsx = to_excel(df)
+            # st.download_button(label='📥 Download Current Result',
+            #                    data=df_xlsx ,
+            #                    file_name= 'file_adaptation-mitigation.xlsx')
+            # # st.session_state.key4 =

+            # # category =set(df.columns)
+            # # removecols = {'Validation','Val-Adaptation','Val-Mitigation','text'}
+            # # category = list(category - removecols)

+        # else:
+        #     st.info("🤔 No document found, please try to upload it at the sidebar!")
+        #     logging.warning("Terminated as no document provided")

 # # Creating truth value dataframe
 # if 'key4' in st.session_state:
appStore/doc_processing.py
ADDED
@@ -0,0 +1,77 @@
+# set path
+import glob, os, sys;
+sys.path.append('../utils')
+from typing import List, Tuple
+from typing_extensions import Literal
+from haystack.schema import Document
+from utils.config import get_classifier_params
+from utils.preprocessing import processingpipeline, paraLengthCheck
+import streamlit as st
+import logging
+import pandas as pd
+params = get_classifier_params("preprocessing")
+
+@st.cache_data
+def runPreprocessingPipeline(file_name:str, file_path:str,
+        split_by: Literal["sentence", "word"] = 'sentence',
+        split_length:int = 2, split_respect_sentence_boundary:bool = False,
+        split_overlap:int = 0, remove_punc:bool = False)->List[Document]:
+    """
+    Creates and runs the preprocessing pipeline; the params for the pipeline
+    are fetched from paramconfig.
+    Params
+    ------------
+    file_name: filename, in case of streamlit application use
+        st.session_state['filename']
+    file_path: filepath, in case of streamlit application use st.session_state['filepath']
+    split_by: document splitting strategy, either word or sentence
+    split_length: when synthetically creating paragraphs from the document,
+        it defines the length of a paragraph.
+    split_respect_sentence_boundary: used with the 'word' splitting strategy.
+    split_overlap: number of words or sentences that overlap when creating
+        the paragraphs. This is done because one sentence or 'some words' only
+        make sense when read together with the others; therefore the overlap is used.
+    remove_punc: whether to remove all punctuation, including ',' and '.'
+    Return
+    --------------
+    List[Document]: when the preprocessing pipeline is run, the output dictionary
+        has four objects. For the Haystack implementation of SDG classification we
+        need the list of Haystack Documents, which can be fetched with
+        key = 'documents' on the output.
+    """
+
+    processing_pipeline = processingpipeline()
+
+    output_pre = processing_pipeline.run(file_paths = file_path,
+                    params= {"FileConverter": {"file_path": file_path, \
+                                               "file_name": file_name},
+                             "UdfPreProcessor": {"remove_punc": remove_punc, \
+                                                 "split_by": split_by, \
+                                                 "split_length": split_length, \
+                                                 "split_overlap": split_overlap, \
+                                                 "split_respect_sentence_boundary": split_respect_sentence_boundary}})
+
+    return output_pre
+
+
+def app():
+    with st.container():
+        if 'filepath' in st.session_state:
+            file_name = st.session_state['filename']
+            file_path = st.session_state['filepath']
+
+            all_documents = runPreprocessingPipeline(file_name= file_name,
+                            file_path= file_path, split_by= params['split_by'],
+                            split_length= params['split_length'],
+                            split_respect_sentence_boundary= params['split_respect_sentence_boundary'],
+                            split_overlap= params['split_overlap'], remove_punc= params['remove_punc'])
+            paralist = paraLengthCheck(all_documents['documents'], 100)
+            df = pd.DataFrame(paralist, columns = ['text','page'])
+            # saving the dataframe to session state
+            st.session_state['key0'] = df
+
+        else:
+            st.info("🤔 No document found, please try to upload it at the sidebar!")
+            logging.warning("Terminated as no document provided")
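Downstream modules pick up the dataframe that doc_processing stores under st.session_state['key0'] (columns 'text' and 'page'); appStore/target.py in this changeset starts from exactly that key. A minimal sketch of that consumption pattern, with a hypothetical classify_fn standing in for the real classifiers:

import streamlit as st

# Sketch of how a downstream app consumes the preprocessed paragraphs
# (mirrors the pattern in appStore/target.py; classify_fn is hypothetical).
def downstream_app(classify_fn):
    if 'key0' not in st.session_state:
        st.info("🤔 No document found, please try to upload it at the sidebar!")
        return
    df = st.session_state['key0']      # dataframe with 'text' and 'page' columns
    df = classify_fn(df)               # adds label/score columns
    st.session_state['key1'] = df      # shared with the *_display() functions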
appStore/ghg.py
ADDED
@@ -0,0 +1,95 @@
+# set path
+import glob, os, sys;
+sys.path.append('../utils')
+
+#import needed libraries
+import seaborn as sns
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import streamlit as st
+from utils.ghg_classifier import load_ghgClassifier, ghg_classification
+import logging
+logger = logging.getLogger(__name__)
+from utils.config import get_classifier_params
+from io import BytesIO
+import xlsxwriter
+import plotly.express as px
+
+
+# Declare all the necessary variables
+classifier_identifier = 'ghg'
+params = get_classifier_params(classifier_identifier)
+
+# Labels dictionary ###
+_lab_dict = {
+    'NEGATIVE':'NO GHG TARGET',
+    'NA':'NOT APPLICABLE',
+    'TARGET':'GHG TARGET',
+}
+
+
+@st.cache_data
+def to_excel(df):
+    len_df = len(df)
+    output = BytesIO()
+    writer = pd.ExcelWriter(output, engine='xlsxwriter')
+    df.to_excel(writer, index=False, sheet_name='Sheet1')
+    workbook = writer.book
+    worksheet = writer.sheets['Sheet1']
+    worksheet.data_validation('E2:E{}'.format(len_df),
+                              {'validate': 'list',
+                               'source': ['No', 'Yes', 'Discard']})
+    writer.save()
+    processed_data = output.getvalue()
+    return processed_data
+
+def app():
+    ### Main app code ###
+    with st.container():
+        if 'key1' in st.session_state:
+            df = st.session_state.key1
+
+            # Load the classifier model
+            classifier = load_ghgClassifier(classifier_name=params['model_name'])
+            st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
+
+            if sum(df['Target Label'] == 'TARGET') > 100:
+                warning_msg = ": This might take sometime, please sit back and relax."
+            else:
+                warning_msg = ""
+
+            df = ghg_classification(haystack_doc=df,
+                                    threshold= params['threshold'])
+            st.session_state.key1 = df
+
+
+def netzero_display():
+    if 'key1' in st.session_state:
+        df = st.session_state.key2
+        hits = df[df['GHG Label'] == 'TARGET']
+        range_val = min(5,len(hits))
+        if range_val !=0:
+            count_df = df['GHG Label'].value_counts()
+            count_df = count_df.rename('count')
+            count_df = count_df.rename_axis('GHG Label').reset_index()
+            count_df['Label_def'] = count_df['GHG Label'].apply(lambda x: _lab_dict[x])
+
+            fig = px.bar(count_df, y="Label_def", x="count", orientation='h', height =200)
+            c1, c2 = st.columns([1,1])
+            with c1:
+                st.plotly_chart(fig, use_container_width= True)
+
+            hits = hits.sort_values(by=['GHG Score'], ascending=False)
+            st.write("")
+            st.markdown("###### Top few GHG Target Classified paragraph/text results ######")
+            range_val = min(5,len(hits))
+            for i in range(range_val):
+                # the page number reflects the page that contains the main paragraph
+                # according to split limit, the overlapping part can be on a separate page
+                st.write('**Result {}** `page {}` (Relevancy Score: {:.2f})'.format(i+1, hits.iloc[i]['page'], hits.iloc[i]['GHG Score']))
+                st.write("\t Text: \t{}".format(hits.iloc[i]['text']))
+        else:
+            st.info("🤔 No GHG target found")
appStore/info.py
DELETED
@@ -1,67 +0,0 @@
-import streamlit as st
-import os
-from PIL import Image
-_ROOT = os.path.abspath(os.path.dirname(__file__))
-def get_data(path):
-    return os.path.join(_ROOT, 'data', path)
-
-def app():
-
-    with open('style.css') as f:
-        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
-
-    st.markdown("<h2 style='text-align: center; \
-                color: black;'> Climate Policy Understanding App</h2>",
-                unsafe_allow_html=True)
-
-    st.markdown("<div style='text-align: center; \
-                color: grey;'>Climate Policy Understanding App is an open-source\
-                digital tool which aims to assist policy analysts and \
-                other users in extracting and filtering relevant \
-                information from public documents.</div>",
-                unsafe_allow_html=True)
-    footer = """
-        <div class="footer-custom">
-            Guidance & Feedback - <a>Nadja Taeger</a> | <a>Marie Hertel</a> | <a>Cecile Schneider</a> |
-            Developer - <a href="https://www.linkedin.com/in/erik-lehmann-giz/" target="_blank">Erik Lehmann</a> |
-            <a href="https://www.linkedin.com/in/prashantpsingh/" target="_blank">Prashant Singh</a> |
-        </div>
-    """
-    st.markdown(footer, unsafe_allow_html=True)
-
-    c1, c2, c3 = st.columns([8,1,12])
-    with c1:
-        image = Image.open('docStore/img/ndc.png')
-        st.image(image)
-    with c3:
-        st.markdown('<div style="text-align: justify;">The manual extraction \
-            of relevant information from text documents is a \
-            time-consuming task for any policy analyst. As the amount and length of \
-            public policy documents in relation to sustainable development (such as \
-            National Development Plans and Nationally Determined Contributions) \
-            continuously increases, a major challenge for policy action tracking – the \
-            evaluation of stated goals and targets and their actual implementation on \
-            the ground – arises. Luckily, Artificial Intelligence (AI) and Natural \
-            Language Processing (NLP) methods can help in shortening and easing this \
-            task for policy analysts.</div><br>',
-            unsafe_allow_html=True)
-
-    intro = """
-        <div style="text-align: justify;">
-        For this purpose, IKI Tracs, SV KLIMA, SPA and the Data Service Center (Deutsche Gesellschaft für Internationale \
-        Zusammenarbeit (GIZ) GmbH) have been collaborating since 2022 on the development \
-        of an AI-powered open-source web application that helps find and extract \
-        relevant information from public policy documents faster to facilitate \
-        evidence-based decision-making processes in sustainable development and beyond.
-        </div>
-        <br>
-    """
-    st.markdown(intro, unsafe_allow_html=True)
-    image2 = Image.open('docStore/img/paris.png')
-    st.image(image2)
appStore/multiapp.py
DELETED
@@ -1,67 +0,0 @@
-"""Frameworks for running multiple Streamlit applications as a single app.
-"""
-import streamlit as st
-from PIL import Image
-from utils.uploadAndExample import add_upload
-
-class MultiApp:
-    """Framework for combining multiple streamlit applications.
-    Usage:
-        def foo():
-            st.title("Hello Foo")
-        def bar():
-            st.title("Hello Bar")
-        app = MultiApp()
-        app.add_app("Foo", foo)
-        app.add_app("Bar", bar)
-        app.run()
-    It is also possible to keep each application in a separate file.
-        import foo
-        import bar
-        app = MultiApp()
-        app.add_app("Foo", foo.app)
-        app.add_app("Bar", bar.app)
-        app.run()
-    """
-    def __init__(self):
-        self.apps = []
-
-    def add_app(self, title, icon, func):
-        """Adds a new application.
-        Parameters
-        ----------
-        func:
-            the python function to render this app.
-        title:
-            title of the app. Appears in the dropdown in the sidebar.
-        """
-        self.apps.append({
-            "title": title,
-            "icon": icon,
-            "function": func
-        })
-
-    def run(self):
-
-        st.sidebar.write(format_func=lambda app: app['title'])
-        #image = Image.open('docStore/img/dsc_giz.png')
-        #st.sidebar.image(image, width =200)
-
-        with st.sidebar:
-            selected = st.selectbox("Select the Task to perform", [page["title"] for page in self.apps],)
-            st.markdown("---")
-
-        for index, item in enumerate(self.apps):
-            if item["title"] == selected:
-                self.apps[index]["function"]()
-                break
-
-        choice = st.sidebar.radio(label = 'Select the Document',
-                                  help = 'You can upload the document \
-                                  or else you can try a example document',
-                                  options = ('Upload Document', 'Try Example'),
-                                  horizontal = True)
-        add_upload(choice)
appStore/netzero.py
CHANGED
@@ -8,11 +8,7 @@ import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import streamlit as st
-
-# from st_aggrid.shared import ColumnsAutoSizeMode
-from utils.netzero_classifier import netzero_classification
-from utils.netzero_classifier import runNetZeroPreprocessingPipeline, load_netzeroClassifier
-# from utils.keyword_extraction import textrank
+from utils.netzero_classifier import load_netzeroClassifier, netzero_classification
 import logging
 logger = logging.getLogger(__name__)
 from utils.config import get_classifier_params
@@ -28,6 +24,7 @@ params = get_classifier_params(classifier_identifier)
 # Labels dictionary ###
 _lab_dict = {
     'NEGATIVE':'NO NETZERO TARGET',
+    'NA':'NOT APPLICABLE',
     'NETZERO':'NETZERO TARGET',
 }

@@ -48,159 +45,51 @@ def to_excel(df):
     return processed_data

 def app():
-
-    #### APP INFO #####
-    with st.container():
-        st.markdown("<h1 style='text-align: center; color: black;'> NetZero Target Extraction </h1>", unsafe_allow_html=True)
-        st.write(' ')
-        st.write(' ')
-
-    with st.expander("ℹ️ - About this app", expanded=False):
-
-        st.write(
-            """
-            The **NetZero Extraction** app is an easy-to-use interface built \
-            in Streamlit for analyzing policy documents for \
-            Classification of the paragraphs/texts in the document *If it \
-            contains any Net-Zero target related information* - \
-            developed by GIZ Data Service Center, GFA, IKI Tracs, \
-            SV Klima and SPA. \n
-            """)
-        st.write("""**Document Processing:** The Uploaded/Selected document is \
-            automatically cleaned and split into paragraphs with a maximum \
-            length of 60 words using a Haystack preprocessing pipeline. The \
-            length of 60 is an empirical value which should reflect the length \
-            of a “context” and should limit the paragraph length deviation. \
-            However, since we want to respect the sentence boundary the limit \
-            can breach and hence this limit of 60 is tentative. \n
-            """)
-
-        st.write("")
-
     ### Main app code ###
     with st.container():
-        st.session_state['key2'] = None
+        if 'key1' in st.session_state:
+            df = st.session_state.key1

-        if 'filepath' in st.session_state:
-            file_name = st.session_state['filename']
-            file_path = st.session_state['filepath']
-
-            # Do the preprocessing of the PDF
-
-            all_documents = runNetZeroPreprocessingPipeline(file_name= file_name,
-                            file_path= file_path, split_by= params['split_by'],
-                            split_length= params['split_length'],
-                            split_respect_sentence_boundary= params['split_respect_sentence_boundary'],
-                            split_overlap= params['split_overlap'], remove_punc= params['remove_punc'])
-
-            # st.dataframe(all_documents['documents'])
-
             # Load the classifier model
             classifier = load_netzeroClassifier(classifier_name=params['model_name'])
             st.session_state['{}_classifier'.format(classifier_identifier)] = classifier

-            if
+            if sum(df['Target Label'] == 'TARGET') > 100:
                 warning_msg = ": This might take sometime, please sit back and relax."
             else:
                 warning_msg = ""

-            # with st.spinner("Running Target Related Paragraph Extractions{}".format(warning_msg)):
-
-            df = netzero_classification(haystack_doc=all_documents['documents'],
+            df = netzero_classification(haystack_doc=df,
                                         threshold= params['threshold'])
-            st.session_state.
-            hits = df[df['Target Label'] == 'NETZERO']
-            range_val = min(5,len(hits))
-            if range_val !=0:
-                count_df = df['Target Label'].value_counts()
-                count_df = count_df.rename('count')
-                count_df = count_df.rename_axis('Target Label').reset_index()
-                count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
-
-                fig = px.bar(count_df, y="Label_def", x="count", orientation='h', height =200)
-                c1, c2 = st.columns([1,1])
-                with c1:
-                    st.plotly_chart(fig,use_container_width= True)
-
-                hits = hits.sort_values(by=['Relevancy'], ascending=False)
-                st.write("")
-                st.markdown("###### Top few NetZero Target Classified paragraph/text results ######")
-                range_val = min(5,len(hits))
-                for i in range(range_val):
-                    # the page number reflects the page that contains the main paragraph
-                    # according to split limit, the overlapping part can be on a separate page
-                    st.write('**Result {}** `page {}` (Relevancy Score: {:.2f})'.format(i+1,hits.iloc[i]['page'],hits.iloc[i]['Relevancy']))
-                    st.write("\t Text: \t{}".format(hits.iloc[i]['text']))
-            else:
-                st.info("🤔 No Netzero target found")
-            df['Validation'] = 'No'
-            df_xlsx = to_excel(df)
-            st.download_button(label='📥 Download Current Result',
-                               data=df_xlsx ,
-                               file_name= 'file_target.xlsx')
-
-        else:
-            st.info("🤔 No document found, please try to upload it at the sidebar!")
-            logging.warning("Terminated as no document provided")
-
-    # # Creating truth value dataframe
-    # if 'key2' in st.session_state:
-    #     if st.session_state.key2 is not None:
-    #         df = st.session_state.key2
-    #         st.markdown("###### Select the threshold for classifier ######")
-    #         c1, c2 = st.columns([1,1])
-
-    #         netzero_df = df[df['Target Label'] == 'NETZERO'].reset_index(drop = True)
-    #         if len(netzero_df) >0:
-    #             with c1:
-    #                 threshold = st.slider("Threshold", min_value=0.00, max_value=1.0,
-    #                                       step=0.01, value=0.5,
-    #                                       help = "Keep High Value if want refined result, low if dont want to miss anything" )
-
-    #             # creating the dataframe for value counts of Labels, along with 'title' of Labels
-    #             temp = df[df['Relevancy']>threshold]
-    #             count_df = temp['Target Label'].value_counts()
-    #             count_df = count_df.rename('count')
-    #             count_df = count_df.rename_axis('Target Label').reset_index()
-    #             count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
-
-    #             plt.rcParams['font.size'] = 25
-    #             colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(count_df)))
-    #             # plot
-    #             fig, ax = plt.subplots()
-    #             ax.pie(count_df['count'], colors=colors, radius=2, center=(4, 4),
-    #                    wedgeprops={"linewidth": 1, "edgecolor": "white"},
-    #                    textprops={'fontsize': 14},
-    #                    frame=False,labels =list(count_df.Label_def),
-    #                    labeldistance=1.2)
-    #             st.markdown("#### Anything related to NetZero Targets? ####")
-
-    #             c4, c5, c6 = st.columns([1,2,2])
-
-    #             with c5:
-    #                 st.pyplot(fig)
-    #             with c6:
-    #                 st.write(count_df[['Label_def','count']])
-
-    #             st.write("")
-
-    #             st.markdown("###### Top few NetZero Target Classified paragraph/text results ######")
-
-    #             st.dataframe(netzero_df.head())
-    #         else:
-    #             st.write("🤔 No Results found")
+            st.session_state.key1 = df
+
+def netzero_display():
+    if 'key1' in st.session_state:
+        df = st.session_state.key2
+        hits = df[df['Netzero Label'] == 'NETZERO']
+        range_val = min(5,len(hits))
+        if range_val !=0:
+            count_df = df['Netzero Label'].value_counts()
+            count_df = count_df.rename('count')
+            count_df = count_df.rename_axis('Netzero Label').reset_index()
+            count_df['Label_def'] = count_df['Netzero Label'].apply(lambda x: _lab_dict[x])
+
+            fig = px.bar(count_df, y="Label_def", x="count", orientation='h', height =200)
+            c1, c2 = st.columns([1,1])
+            with c1:
+                st.plotly_chart(fig, use_container_width= True)
+
+            hits = hits.sort_values(by=['Netzero Score'], ascending=False)
+            st.write("")
+            st.markdown("###### Top few NetZero Target Classified paragraph/text results ######")
+            range_val = min(5,len(hits))
+            for i in range(range_val):
+                # the page number reflects the page that contains the main paragraph
+                # according to split limit, the overlapping part can be on a separate page
+                st.write('**Result {}** `page {}` (Relevancy Score: {:.2f})'.format(i+1, hits.iloc[i]['page'], hits.iloc[i]['Netzero Score']))
+                st.write("\t Text: \t{}".format(hits.iloc[i]['text']))
+        else:
+            st.info("🤔 No Netzero target found")
appStore/sector.py
CHANGED
@@ -8,11 +8,7 @@ import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import streamlit as st
-
-# from st_aggrid.shared import ColumnsAutoSizeMode
-from utils.sector_classifier import sector_classification
-from utils.sector_classifier import runSectorPreprocessingPipeline, load_sectorClassifier
-# from utils.keyword_extraction import textrank
+from utils.sector_classifier import load_sectorClassifier, sector_classification
 import logging
 logger = logging.getLogger(__name__)
 from utils.config import get_classifier_params
@@ -58,107 +54,68 @@ def to_excel(df,sectorlist):

 def app():

-    #### APP INFO #####
-    with st.container():
-        st.markdown("<h1 style='text-align: center; color: black;'> Sector Classification </h1>", unsafe_allow_html=True)
-        st.write(' ')
-        st.write(' ')
-
-    with st.expander("ℹ️ - About this app", expanded=False):
-
-        st.write(
-            """
-            The **Sector Classification** app is an easy-to-use interface built \
-            in Streamlit for analyzing policy documents for \
-            Classification of the paragraphs/texts in the document *If it \
-            belongs to particular sector or not*. The paragraph can belong to multiple sectors - \
-            developed by GIZ Data Service Center, GFA, IKI Tracs, \
-            SV Klima and SPA. \n
-            """)
-        st.write("""**Document Processing:** The Uploaded/Selected document is \
-            automatically cleaned and split into paragraphs with a maximum \
-            length of 60 words using a Haystack preprocessing pipeline. The \
-            length of 60 is an empirical value which should reflect the length \
-            of a “context” and should limit the paragraph length deviation. \
-            However, since we want to respect the sentence boundary the limit \
-            can breach and hence this limit of 60 is tentative. \n
-            """)
-
-        st.write("")
-
     ### Main app code ###
     with st.container():
-        if st.button("RUN Sector Classification"):
-            if 'key' not in st.session_state:
-                st.session_state['key'] = None

-            if '
-            file_path = st.session_state['filepath']
-
-            all_documents = runSectorPreprocessingPipeline(file_name= file_name,
-                            file_path= file_path, split_by= params['split_by'],
-                            split_length= params['split_length'],
-                            split_respect_sentence_boundary= params['split_respect_sentence_boundary'],
-                            split_overlap= params['split_overlap'], remove_punc= params['remove_punc'])
-            # st.write(all_documents['documents'])
+        if 'key1' in st.session_state:
+            df = st.session_state.key1
             classifier = load_sectorClassifier(classifier_name=params['model_name'])
             st.session_state['{}_classifier'.format(classifier_identifier)] = classifier

-            if
+            if sum(df['Target Label'] == 'TARGET') > 100:
                 warning_msg = ": This might take sometime, please sit back and relax."
             else:
                 warning_msg = ""

-            # #st.write(all_documents['documents'],_lab_dict,classifier_identifier,params['threshold'])
-            # with st.spinner("Running Target Related Paragraph Extractions{}".format(warning_msg)):
-
-            df = sector_classification(haystack_doc=
+            df = sector_classification(haystack_doc=df,
                                         threshold= params['threshold'])
-            #
-            #
-            #
-            #
-            # st.
-            df['
-            df['
-            df['
-            st.
+
+            st.session_state.key1 = df
+
+            # # st.write(df)
+            # threshold= params['threshold']
+            # truth_df = df.drop(['text'],axis=1)
+            # truth_df = truth_df.astype(float) >= threshold
+            # truth_df = truth_df.astype(str)
+            # categories = list(truth_df.columns)
+
+            # placeholder = {}
+            # for val in categories:
+            #     placeholder[val] = dict(truth_df[val].value_counts())
+            # count_df = pd.DataFrame.from_dict(placeholder)
+            # count_df = count_df.T
+            # count_df = count_df.reset_index()
+            # # st.write(count_df)
+            # placeholder = []
+            # for i in range(len(count_df)):
+            #     placeholder.append([count_df.iloc[i]['index'],count_df['True'][i],'Yes'])
+            #     placeholder.append([count_df.iloc[i]['index'],count_df['False'][i],'No'])
+            # count_df = pd.DataFrame(placeholder, columns = ['category','count','truth_value'])
+            # # st.write("Total Paragraphs: {}".format(len(df)))
+            # fig = px.bar(count_df, x='category', y='count',
+            #              color='truth_value')
+            # # c1, c2 = st.columns([1,1])
+            # # with c1:
+            # st.plotly_chart(fig,use_container_width= True)
+
+            # truth_df['labels'] = truth_df.apply(lambda x: {i if x[i]=='True' else None for i in categories}, axis=1)
+            # truth_df['labels'] = truth_df.apply(lambda x: list(x['labels'] -{None}),axis=1)
+            # # st.write(truth_df)
+            # df = pd.concat([df,truth_df['labels']],axis=1)
+            # df['Validation'] = 'No'
+            # df['Sector1'] = 'Blank'
+            # df['Sector2'] = 'Blank'
+            # df['Sector3'] = 'Blank'
+            # df['Sector4'] = 'Blank'
+            # df['Sector5'] = 'Blank'
+            # df_xlsx = to_excel(df,categories)
+            # st.download_button(label='📥 Download Current Result',
+            #                    data=df_xlsx ,
+            #                    # file_name= 'file_sector.xlsx')
+        # else:
+        #     st.info("🤔 No document found, please try to upload it at the sidebar!")
+        #     logging.warning("Terminated as no document provided")

 # # Creating truth value dataframe
 # if 'key' in st.session_state:
appStore/target.py
CHANGED
@@ -8,11 +8,7 @@ import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import streamlit as st
-
-# from st_aggrid.shared import ColumnsAutoSizeMode
-from utils.target_classifier import target_classification
-from utils.target_classifier import runTargetPreprocessingPipeline, load_targetClassifier
-# from utils.keyword_extraction import textrank
+from utils.target_classifier import load_targetClassifier, target_classification
 import logging
 logger = logging.getLogger(__name__)
 from utils.config import get_classifier_params
@@ -26,8 +22,8 @@ params = get_classifier_params(classifier_identifier)

 ## Labels dictionary ###
 _lab_dict = {
-    '
-    '
+    'NEGATIVE':'NO TARGET INFO',
+    'TARGET':'TARGET',
 }

 @st.cache_data
@@ -48,164 +44,68 @@ def to_excel(df):
 def app():

     #### APP INFO #####
-
-    st.write(
-        """
-        The **Target Extraction** app is an easy-to-use interface built \
-        in Streamlit for analyzing policy documents for \
-        Classification of the paragraphs/texts in the document *If it \
-        contains any Economy-Wide Targets related information* - \
-        developed by GIZ Data Service Center, GFA, IKI Tracs, \
-        SV Klima and SPA. \n
-        """)
-    st.write("""**Document Processing:** The Uploaded/Selected document is \
-        automatically cleaned and split into paragraphs with a maximum \
-        length of 60 words using a Haystack preprocessing pipeline. The \
-        length of 60 is an empirical value which should reflect the length \
-        of a “context” and should limit the paragraph length deviation. \
-        However, since we want to respect the sentence boundary the limit \
-        can breach and hence this limit of 60 is tentative. \n
-        """)
-
-    st.write("")
+    # st.write(
+    #     """
+    #     The **Target Extraction** app is an easy-to-use interface built \
+    #     in Streamlit for analyzing policy documents for \
+    #     Classification of the paragraphs/texts in the document *If it \
+    #     contains any Economy-Wide Targets related information* - \
+    #     developed by GIZ Data Service Center, GFA, IKI Tracs, \
+    #     SV Klima and SPA. \n
+    #     """)

     ### Main app code ###
     with st.container():
-        if st.
-
-            all_documents = runTargetPreprocessingPipeline(file_name= file_name,
-                            file_path= file_path, split_by= params['split_by'],
-                            split_length= params['split_length'],
-                            split_respect_sentence_boundary= params['split_respect_sentence_boundary'],
-                            split_overlap= params['split_overlap'], remove_punc= params['remove_punc'])
-            # st.write(all_documents['documents'])
-
-            #load Classifier
-            classifier = load_targetClassifier(classifier_name=params['model_name'])
-            st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
-            if len(all_documents['documents']) > 100:
-                warning_msg = ": This might take sometime, please sit back and relax."
-            else:
-                warning_msg = ""
-
-            # #st.write(all_documents['documents'],_lab_dict,classifier_identifier,params['threshold'])
-            # with st.spinner("Running Target Related Paragraph Extractions{}".format(warning_msg)):
-
-            df = target_classification(haystack_doc=all_documents['documents'],
-                                       threshold= params['threshold'])
-            st.session_state.key1 = df
-            # temp = df[df['Relevancy']>threshold]
-            hits = df[df['Target Label'] == 'LABEL_1']
-            range_val = min(5,len(hits))
-            if range_val !=0:
-                count_df = df['Target Label'].value_counts()
-                count_df = count_df.rename('count')
-                count_df = count_df.rename_axis('Target Label').reset_index()
-                count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
-
-                fig = px.bar(count_df, y="Label_def", x="count", orientation='h', height=200)
-                c1, c2 = st.columns([1,1])
-                with c1:
-                    st.plotly_chart(fig,use_container_width= True)
-
-                hits = hits.sort_values(by=['Relevancy'], ascending=False)
-                st.write("")
-                st.markdown("###### Top few Economy Wide Target Classified paragraph/text results ######")
-                range_val = min(5,len(hits))
-                for i in range(range_val):
-                    # the page number reflects the page that contains the main paragraph
-                    # according to split limit, the overlapping part can be on a separate page
-                    st.write('**Result {}** `page {}` (Relevancy Score: {:.2f})'.format(i+1,hits.iloc[i]['page'],hits.iloc[i]['Relevancy']))
-                    st.write("\t Text: \t{}".format(hits.iloc[i]['text'].replace("\n", " ")))
-
-            else:
-                st.info("🤔 No Economy Wide Target found")
-            df['Validation'] = 'No'
-            df_xlsx = to_excel(df)
-            st.download_button(label='📥 Download Current Result',
-                               data=df_xlsx ,
-                               file_name= 'file_target.xlsx')
-
-        else:
-            logging.warning("Terminated as no document provided")
-
-    # # Creating truth value dataframe
-    # if 'key1' in st.session_state:
-    #     if st.session_state.key1 is not None:
-    #         df = st.session_state.key1
-    #         st.markdown("###### Select the threshold for classifier ######")
-    #         c1, c2 = st.columns([1,1])
-
-    #         with c1:
-    #             threshold = st.slider("Threshold", min_value=0.00, max_value=1.0,
-    #                                   step=0.01, value=0.5,
-    #                                   help = "Keep High Value if want refined result, low if dont want to miss anything" )
-    #         sectors =set(df.columns)
-    #         removecols = {'Validation','Sectors','text'}
-    #         sectors = list(sectors - removecols)
-
-    #         # creating the dataframe for value counts of Labels, along with 'title' of Labels
-    #         temp = df[df['Relevancy']>threshold]
-    #         count_df = temp['Target Label'].value_counts()
-    #         count_df = count_df.rename('count')
-    #         count_df = count_df.rename_axis('Target Label').reset_index()
-    #         count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
-
-    #         plt.rcParams['font.size'] = 25
-    #         colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(count_df)))
-    #         # plot
-    #         fig, ax = plt.subplots()
-    #         ax.pie(count_df['count'], colors=colors, radius=2, center=(4, 4),
-    #                wedgeprops={"linewidth": 1, "edgecolor": "white"},
-    #                textprops={'fontsize': 14},
-    #                frame=False,labels =list(count_df.Label_def),
-    #                labeldistance=1.2)
-    #         st.markdown("#### Anything related to Targets? ####")
-
-    #         c4, c5, c6 = st.columns([1,2,2])
-
-    #         with c5:
-    #             st.pyplot(fig)
-    #         with c6:
-    #             st.write(count_df[['Label_def','count']])
+        if 'key0' in st.session_state:
+            df = st.session_state.key0
+
+            #load Classifier
+            classifier = load_targetClassifier(classifier_name=params['model_name'])
+            st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
+            if len(df) > 100:
+                warning_msg = ": This might take sometime, please sit back and relax."
+            else:
+                warning_msg = ""
+
+            df = target_classification(haystack_doc=df,
+                                       threshold= params['threshold'])
+            st.session_state.key1 = df
+
+            # # excel part
+            # temp = df[df['Relevancy']>threshold]
+
+            # df['Validation'] = 'No'
+            # df_xlsx = to_excel(df)
+            # st.download_button(label='📥 Download Current Result',
+            #                    data=df_xlsx ,
+            #                    file_name= 'file_target.xlsx')
+
+def target_display():
+    if 'key1' in st.session_state:
+        df = st.session_state.key1
+        hits = df[df['Target Label'] == 'TARGET']
+        range_val = min(5,len(hits))
+        if range_val !=0:
+            count_df = df['Target Label'].value_counts()
+            count_df = count_df.rename('count')
+            count_df = count_df.rename_axis('Target Label').reset_index()
+            count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
+
+            fig = px.bar(count_df, y="Label_def", x="count", orientation='h', height=200)
+            c1, c2 = st.columns([1,1])
+            with c1:
+                st.plotly_chart(fig,use_container_width= True)
+
+            hits = hits.sort_values(by=['Relevancy'], ascending=False)
+            st.write("")
+            st.markdown("###### Top few Economy Wide Target Classified paragraph/text results ######")
+            range_val = min(5,len(hits))
+            for i in range(range_val):
+                # the page number reflects the page that contains the main paragraph
+                # according to split limit, the overlapping part can be on a separate page
+                st.write('**Result {}** `page {}` (Relevancy Score: {:.2f})'.format(i+1,hits.iloc[i]['page'],hits.iloc[i]['Relevancy']))
+                st.write("\t Text: \t{}".format(hits.iloc[i]['text'].replace("\n", " ")))
+
+        else:
+            st.info("🤔 No Targets found")
paramconfig.cfg
CHANGED
@@ -1,6 +1,16 @@
+[preprocessing]
+THRESHOLD = 0.50
+MODEL = garbage
+SPLIT_BY = word
+REMOVE_PUNC = 0
+SPLIT_LENGTH = 60
+SPLIT_OVERLAP = 10
+RESPECT_SENTENCE_BOUNDARY = 1
+TOP_KEY = 10
+
 [target]
 THRESHOLD = 0.50
-MODEL = mtyrrell/
+MODEL = mtyrrell/ikitracs_target_mpnet
 SPLIT_BY = word
 REMOVE_PUNC = 0
 SPLIT_LENGTH = 60
@@ -36,4 +46,14 @@ REMOVE_PUNC = 0
 SPLIT_LENGTH = 60
 SPLIT_OVERLAP = 10
 RESPECT_SENTENCE_BOUNDARY = 1
+TOP_KEY = 10
+
+[ghg]
+THRESHOLD = 0.50
+MODEL = mtyrrell/ikitracs_transport_ghg
+SPLIT_BY = word
+REMOVE_PUNC = 0
+SPLIT_LENGTH = 60
+SPLIT_OVERLAP = 10
+RESPECT_SENTENCE_BOUNDARY = 1
 TOP_KEY = 10
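Each section of paramconfig.cfg is read through get_classifier_params from utils.config, which is imported throughout the appStore modules but is not part of this changeset. A minimal sketch of what that helper presumably does, inferred from the config keys above and the lower-case parameter names the apps use (params['model_name'], params['split_by'], and so on); the key mapping, the 'top_n' name, and the use of configparser are assumptions, not confirmed by this diff:

import configparser

def get_classifier_params(section: str, config_file: str = "paramconfig.cfg") -> dict:
    # Hypothetical sketch: read one section of paramconfig.cfg and map its
    # upper-case keys to the lower-case keys the Streamlit apps expect.
    config = configparser.ConfigParser()
    config.read(config_file)
    return {
        'model_name': config.get(section, 'MODEL'),
        'split_by': config.get(section, 'SPLIT_BY'),
        'split_length': int(config.get(section, 'SPLIT_LENGTH')),
        'split_overlap': int(config.get(section, 'SPLIT_OVERLAP')),
        'remove_punc': bool(int(config.get(section, 'REMOVE_PUNC'))),
        'split_respect_sentence_boundary': bool(int(config.get(section, 'RESPECT_SENTENCE_BOUNDARY'))),
        'threshold': float(config.get(section, 'THRESHOLD')),
        'top_n': int(config.get(section, 'TOP_KEY')),   # name guessed from TOP_KEY
    }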
utils/adapmit_classifier.py
CHANGED
@@ -34,10 +34,6 @@ def load_adapmitClassifier(config_file:str = None, classifier_name:str = None):
         classifier_name = config.get('adapmit','MODEL')

     logging.info("Loading Adaptation Mitigation classifier")
-    # doc_classifier = TransformersDocumentClassifier(
-    #     model_name_or_path=classifier_name,
-    #     task="text-classification",
-    #     top_k = None)
     doc_classifier = pipeline("text-classification",
                               model=classifier_name,
                               return_all_scores=True,
@@ -47,51 +43,8 @@ def load_adapmitClassifier(config_file:str = None, classifier_name:str = None):
     return doc_classifier


-def runAdapMitPreprocessingPipeline(file_name:str, file_path:str,
-        split_by: Literal["sentence", "word"] = 'sentence',
-        split_length:int = 2, split_respect_sentence_boundary:bool = False,
-        split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
-    """
-    creates the pipeline and runs the preprocessing pipeline,
-    the params for pipeline are fetched from paramconfig
-    Params
-    ------------
-    file_name: filename, in case of streamlit application use
-        st.session_state['filename']
-    file_path: filepath, in case of streamlit application use st.session_state['filepath']
-    split_by: document splitting strategy either as word or sentence
-    split_length: when synthetically creating the paragrpahs from document,
-        it defines the length of paragraph.
-    split_respect_sentence_boundary: Used when using 'word' strategy for
-        splititng of text.
-    split_overlap: Number of words or sentences that overlap when creating
-        the paragraphs. This is done as one sentence or 'some words' make sense
-        when read in together with others. Therefore the overlap is used.
-    remove_punc: to remove all Punctuation including ',' and '.' or not
-    Return
-    --------------
-    List[Document]: When preprocessing pipeline is run, the output dictionary
-        has four objects. For the Haysatck implementation of SDG classification we,
-        need to use the List of Haystack Document, which can be fetched by
-        key = 'documents' on output.
-    """
-
-    adapmit_processing_pipeline = processingpipeline()
-
-    output_adapmit_pre = adapmit_processing_pipeline.run(file_paths = file_path,
-                            params= {"FileConverter": {"file_path": file_path, \
-                                        "file_name": file_name},
-                                     "UdfPreProcessor": {"remove_punc": remove_punc, \
-                                        "split_by": split_by, \
-                                        "split_length":split_length,\
-                                        "split_overlap": split_overlap, \
-                                        "split_respect_sentence_boundary":split_respect_sentence_boundary}})
-
-    return output_adapmit_pre
-
-
 @st.cache_data
-def adapmit_classification(haystack_doc:
+def adapmit_classification(haystack_doc:pd.DataFrame,
                            threshold:float = 0.5,
                            classifier_model:pipeline= None
                            )->Tuple[DataFrame,Series]:
@@ -115,10 +68,14 @@ def adapmit_classification(haystack_doc:List[Document],
         the number of times it is covered/discussed/count_of_paragraphs.
     """
     logging.info("Working on Adaptation-Mitigation Identification")
+    haystack_doc['Adapt-Mitig Label'] = 'NA'
+    df1 = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
+    df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
+
     if not classifier_model:
         classifier_model = st.session_state['adapmit_classifier']

-    predictions = classifier_model(
+    predictions = classifier_model(list(df1.text))
     # converting the predictions to desired format
     list_ = []
     for i in range(len(predictions)):
@@ -128,9 +85,17 @@ def adapmit_classification(haystack_doc:List[Document],
         for j in range(len(temp)):
             placeholder[temp[j]['label']] = temp[j]['score']
         list_.append(placeholder)
-    labels_ = [{**
-
+    labels_ = [{**list_[l]} for l in range(len(predictions))]

     return df
labels_ = [{**list_[l]} for l in range(len(predictions))]
|
89 |
+
truth_df = DataFrame.from_dict(labels_)
|
90 |
+
truth_df = truth_df.round(2)
|
91 |
+
truth_df = truth_df.astype(float) >= threshold
|
92 |
+
truth_df = truth_df.astype(str)
|
93 |
+
categories = list(truth_df.columns)
|
94 |
+
truth_df['Adapt-Mitig Label'] = truth_df.apply(lambda x: {i if x[i]=='True'
|
95 |
+
else None for i in categories}, axis=1)
|
96 |
+
truth_df['Adapt-Mitig Label'] = truth_df.apply(lambda x:
|
97 |
+
list(x['Adapt-Mitig Label'] -{None}),axis=1)
|
98 |
+
df1['Adapt-Mitig Label'] = list(truth_df['Adapt-Mitig Label'])
|
99 |
+
df = pd.concat([df,df1])
|
100 |
|
101 |
return df
|
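The block above is the core of the new multi-label post-processing: the pipeline is loaded with return_all_scores=True, so every paragraph gets a score per class, and any class at or above the threshold survives into the 'Adapt-Mitig Label' list. A standalone sketch with toy scores (the class names are illustrative, not the model's actual labels), using a plain list comprehension in place of the set-minus-{None} trick:

    from pandas import DataFrame

    threshold = 0.5
    # one label->score dict per paragraph, as built by the loop over `predictions`
    list_ = [{'Adaptation': 0.81, 'Mitigation': 0.12},
             {'Adaptation': 0.55, 'Mitigation': 0.67}]

    truth_df = DataFrame(list_).round(2)
    truth_df = truth_df >= threshold                 # boolean mask per class
    categories = list(truth_df.columns)
    truth_df['Adapt-Mitig Label'] = truth_df.apply(
        lambda x: [i for i in categories if x[i]], axis=1)
    print(truth_df['Adapt-Mitig Label'].tolist())
    # [['Adaptation'], ['Adaptation', 'Mitigation']]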
utils/ghg_classifier.py
ADDED
@@ -0,0 +1,90 @@
+from haystack.nodes import TransformersDocumentClassifier
+from haystack.schema import Document
+from typing import List, Tuple
+from typing_extensions import Literal
+import logging
+import pandas as pd
+from pandas import DataFrame, Series
+from utils.config import getconfig
+from utils.preprocessing import processingpipeline
+import streamlit as st
+from transformers import pipeline
+
+# Labels dictionary ###
+_lab_dict = {
+            'NEGATIVE':'NO GHG TARGET',
+            'TARGET':'GHG TARGET',
+            }
+
+@st.cache_resource
+def load_ghgClassifier(config_file:str = None, classifier_name:str = None):
+    """
+    loads the text-classification model; the name/path of the model on the
+    HF hub is used to fetch the model object. Either config file or
+    model name should be passed.
+    1. https://docs.haystack.deepset.ai/reference/document-classifier-api
+    2. https://docs.haystack.deepset.ai/docs/document_classifier
+    Params
+    --------
+    config_file: config file path from which to read the model name
+    classifier_name: if model name is passed, it takes priority; if not
+                    found then will look for config file, else raise error.
+    Return: document classifier model
+    """
+    if not classifier_name:
+        if not config_file:
+            logging.warning("Pass either model name or config file")
+            return
+        else:
+            config = getconfig(config_file)
+            classifier_name = config.get('ghg','MODEL')
+
+    logging.info("Loading ghg classifier")
+    doc_classifier = pipeline("text-classification",
+                            model=classifier_name,
+                            top_k =1)
+
+    return doc_classifier
+
+
+@st.cache_data
+def ghg_classification(haystack_doc:pd.DataFrame,
+                        threshold:float = 0.5,
+                        classifier_model:pipeline= None
+                        )->Tuple[DataFrame,Series]:
+    """
+    Text-Classification on the list of texts provided. The classifier assigns
+    the most appropriate label to each text; the labels indicate whether a
+    paragraph contains a GHG (greenhouse gas) target or not.
+    Params
+    ---------
+    haystack_doc: Dataframe of paragraphs (output of the preprocessing and
+                target steps) with 'text' and 'Target Label' columns.
+    threshold: threshold value for the model to keep the results from classifier
+    classifier_model: you can pass the classifier model directly, which takes priority;
+                however if not then looks for the model in the streamlit session.
+                In case of streamlit avoid passing the model directly.
+    Returns
+    ----------
+    df: Dataframe with the input paragraphs plus 'GHG Label' and 'GHG Score' columns.
+    """
+    logging.info("Working on GHG Extraction")
+    haystack_doc['GHG Label'] = 'NA'
+    haystack_doc['GHG Score'] = 'NA'
+    temp = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
+    df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
+
+    if not classifier_model:
+        classifier_model = st.session_state['ghg_classifier']
+
+    results = classifier_model(list(temp.text))
+    labels_= [(l[0]['label'],l[0]['score']) for l in results]
+    temp['GHG Label'],temp['GHG Score'] = zip(*labels_)
+    df = pd.concat([df,temp])
+    df = df.reset_index(drop =True)
+    df.index += 1
+
+    return df
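A hedged usage sketch for the new module (toy DataFrame; in the app the paragraph frame is produced by the document-processing and target steps and the model is cached in st.session_state['ghg_classifier']):

    import pandas as pd
    from utils.ghg_classifier import load_ghgClassifier, ghg_classification

    paras = pd.DataFrame({
        'text': ["Reduce GHG emissions by 40% below 2010 levels by 2030.",
                 "The country has a long coastline."],
        'Target Label': ['TARGET', 'NEGATIVE'],
    })

    clf = load_ghgClassifier(config_file='paramconfig.cfg')   # reads MODEL from the new [ghg] section
    out = ghg_classification(paras, classifier_model=clf)
    print(out[['text', 'Target Label', 'GHG Label', 'GHG Score']])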
utils/netzero_classifier.py
CHANGED
@@ -8,6 +8,7 @@ from pandas import DataFrame, Series
 from utils.config import getconfig
 from utils.preprocessing import processingpipeline
 import streamlit as st
+from transformers import pipeline
 
 # Labels dictionary ###
 _lab_dict = {
@@ -39,60 +40,17 @@ def load_netzeroClassifier(config_file:str = None, classifier_name:str = None):
             classifier_name = config.get('netzero','MODEL')
 
     logging.info("Loading netzero classifier")
-    doc_classifier =
-
-
+    doc_classifier = pipeline("text-classification",
+                            model=classifier_name,
+                            top_k =1)
 
     return doc_classifier
 
 
-def runNetZeroPreprocessingPipeline(file_name:str, file_path:str,
-                        split_by: Literal["sentence", "word"] = 'sentence',
-                        split_length:int = 2, split_respect_sentence_boundary:bool = False,
-                        split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
-    """
-    creates the pipeline and runs the preprocessing pipeline,
-    the params for pipeline are fetched from paramconfig
-    Params
-    ------------
-    file_name: filename, in case of streamlit application use
-                st.session_state['filename']
-    file_path: filepath, in case of streamlit application use st.session_state['filepath']
-    split_by: document splitting strategy either as word or sentence
-    split_length: when synthetically creating the paragrpahs from document,
-                it defines the length of paragraph.
-    split_respect_sentence_boundary: Used when using 'word' strategy for
-                splititng of text.
-    split_overlap: Number of words or sentences that overlap when creating
-                the paragraphs. This is done as one sentence or 'some words' make sense
-                when read in together with others. Therefore the overlap is used.
-    remove_punc: to remove all Punctuation including ',' and '.' or not
-    Return
-    --------------
-    List[Document]: When preprocessing pipeline is run, the output dictionary
-                has four objects. For the Haysatck implementation of SDG classification we,
-                need to use the List of Haystack Document, which can be fetched by
-                key = 'documents' on output.
-    """
-
-    netzero_processing_pipeline = processingpipeline()
-
-    output_netzero_pre = netzero_processing_pipeline.run(file_paths = file_path,
-                            params= {"FileConverter": {"file_path": file_path, \
-                                        "file_name": file_name},
-                                    "UdfPreProcessor": {"remove_punc": remove_punc, \
-                                            "split_by": split_by, \
-                                            "split_length":split_length,\
-                                            "split_overlap": split_overlap, \
-                                            "split_respect_sentence_boundary":split_respect_sentence_boundary}})
-
-    return output_netzero_pre
-
-
 @st.cache_data
-def netzero_classification(haystack_doc:
+def netzero_classification(haystack_doc:pd.DataFrame,
                         threshold:float = 0.8,
-                        classifier_model:
+                        classifier_model:pipeline= None
                         )->Tuple[DataFrame,Series]:
     """
     Text-Classification on the list of texts provided. Classifier provides the
@@ -114,24 +72,19 @@ def netzero_classification(haystack_doc:List[Document],
         the number of times it is covered/discussed/count_of_paragraphs.
     """
     logging.info("Working on Netzero Extraction")
+    haystack_doc['Netzero Label'] = 'NA'
+    haystack_doc['Netzero Score'] = 'NA'
+    temp = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
+    df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
+
     if not classifier_model:
         classifier_model = st.session_state['netzero_classifier']
 
-    results = classifier_model.
-    labels_= [(l
-
-
-    df =
-
-    df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
+    results = classifier_model(list(temp.text))
+    labels_= [(l[0]['label'],l[0]['score']) for l in results]
+    temp['Netzero Label'],temp['Netzero Score'] = zip(*labels_)
+    df = pd.concat([df,temp])
+    df = df.reset_index(drop =True)
     df.index += 1
-    # df =df[df['Relevancy']>threshold]
-    df['Label_def'] = df['Target Label'].apply(lambda i: _lab_dict[i])
-
-    # creating the dataframe for value counts of Labels, along with 'title' of Labels
-    # count_df = df['Target Label'].value_counts()
-    # count_df = count_df.rename('count')
-    # count_df = count_df.rename_axis('Target Label').reset_index()
-    # count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
 
     return df
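Both the netzero and ghg classifiers now rely on the same output shape: with top_k=1 the transformers text-classification pipeline returns one single-element list of {'label', 'score'} per input text, which the l[0] indexing and zip(*labels_) turn into a label column and a score column. A small sketch with made-up values:

    results = [[{'label': 'TARGET', 'score': 0.93}],
               [{'label': 'NEGATIVE', 'score': 0.88}]]

    labels_ = [(l[0]['label'], l[0]['score']) for l in results]
    netzero_label, netzero_score = zip(*labels_)
    print(netzero_label)   # ('TARGET', 'NEGATIVE')
    print(netzero_score)   # (0.93, 0.88)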
utils/preprocessing.py
CHANGED
@@ -150,20 +150,36 @@ def basic(s:str, remove_punc:bool = False):
 
     return s.strip()
 
-def paraLengthCheck(paraList, max_len =
+def paraLengthCheck(paraList, max_len = 100):
+    """
+    There are cases where preprocessor cannot respect word limit, when using
+    respect sentence boundary flag due to missing sentence boundaries.
+    Therefore we run one more round of split here for those paragraphs
+
+    Params
+    ---------------
+    paraList : list of paragraphs/text
+    max_len : max length to be respected by sentences which bypassed
+                preprocessor strategy
+
+    """
     new_para_list = []
     for passage in paraList:
-        if
-
-
+        # check if para exceeds words limit
+        if len(passage.content.split()) > max_len:
+            # we might need few iterations example if para = 512 tokens
+            # we need to iterate 5 times to reduce para to size limit of '100'
+            iterations = int(len(passage.content.split())/max_len)
             for i in range(iterations):
-                temp = " ".join(passage.split()[max_len*i:max_len*(i+1)])
-                new_para_list.append(temp)
-                temp = " ".join(passage.split()[max_len*(i+1):])
-                new_para_list.append(temp)
+                temp = " ".join(passage.content.split()[max_len*i:max_len*(i+1)])
+                new_para_list.append((temp,passage.meta['page']))
+                temp = " ".join(passage.content.split()[max_len*(i+1):])
+                new_para_list.append((temp,passage.meta['page']))
         else:
-
-
+            # paragraphs which dont need any splitting
+            new_para_list.append((passage.content, passage.meta['page']))
+
+    logging.info("New paragraphs length {}".format(len(new_para_list)))
     return new_para_list
 
 class UdfPreProcessor(BaseComponent):
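Because paraLengthCheck now works on haystack Documents and returns (text, page) tuples, every re-split chunk keeps its source page number. A small sketch of the behaviour, assuming the function is importable as utils.preprocessing.paraLengthCheck as in this commit:

    from haystack.schema import Document
    from utils.preprocessing import paraLengthCheck

    long_text = " ".join(["word"] * 150)     # 150 words, above the default max_len of 100
    docs = [Document(content=long_text, meta={'page': 3}),
            Document(content="a short paragraph", meta={'page': 4})]

    for text, page in paraLengthCheck(docs, max_len=100):
        print(page, len(text.split()))
    # 3 100
    # 3 50
    # 4 3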
utils/sector_classifier.py
CHANGED
@@ -11,12 +11,6 @@ from haystack.nodes import TransformersDocumentClassifier
 from transformers import pipeline
 
 
-# # Labels dictionary ###
-# _lab_dict = {
-#             'NEGATIVE':'NO NETZERO TARGET',
-#             'NETZERO':'NETZERO TARGET',
-#             }
-
 @st.cache_resource
 def load_sectorClassifier(config_file:str = None, classifier_name:str = None):
     """
@@ -58,53 +52,10 @@ def load_sectorClassifier(config_file:str = None, classifier_name:str = None):
     return doc_classifier
 
 
-def runSectorPreprocessingPipeline(file_name:str, file_path:str,
-                        split_by: Literal["sentence", "word"] = 'sentence',
-                        split_length:int = 2, split_respect_sentence_boundary:bool = False,
-                        split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
-    """
-    creates the pipeline and runs the preprocessing pipeline,
-    the params for pipeline are fetched from paramconfig
-    Params
-    ------------
-    file_name: filename, in case of streamlit application use
-                st.session_state['filename']
-    file_path: filepath, in case of streamlit application use st.session_state['filepath']
-    split_by: document splitting strategy either as word or sentence
-    split_length: when synthetically creating the paragrpahs from document,
-                it defines the length of paragraph.
-    split_respect_sentence_boundary: Used when using 'word' strategy for
-                splititng of text.
-    split_overlap: Number of words or sentences that overlap when creating
-                the paragraphs. This is done as one sentence or 'some words' make sense
-                when read in together with others. Therefore the overlap is used.
-    remove_punc: to remove all Punctuation including ',' and '.' or not
-    Return
-    --------------
-    List[Document]: When preprocessing pipeline is run, the output dictionary
-                has four objects. For the Haysatck implementation of SDG classification we,
-                need to use the List of Haystack Document, which can be fetched by
-                key = 'documents' on output.
-    """
-
-    sector_processing_pipeline = processingpipeline()
-
-    output_sector_pre = sector_processing_pipeline.run(file_paths = file_path,
-                            params= {"FileConverter": {"file_path": file_path, \
-                                        "file_name": file_name},
-                                    "UdfPreProcessor": {"remove_punc": remove_punc, \
-                                            "split_by": split_by, \
-                                            "split_length":split_length,\
-                                            "split_overlap": split_overlap, \
-                                            "split_respect_sentence_boundary":split_respect_sentence_boundary}})
-
-    return output_sector_pre
-
-
 @st.cache_data
-def sector_classification(haystack_doc:
-                        threshold:float = 0.
-                        classifier_model:
+def sector_classification(haystack_doc:pd.DataFrame,
+                        threshold:float = 0.5,
+                        classifier_model:pipeline= None
                         )->Tuple[DataFrame,Series]:
     """
     Text-Classification on the list of texts provided. Classifier provides the
@@ -126,10 +77,14 @@ def sector_classification(haystack_doc:List[Document],
         the number of times it is covered/discussed/count_of_paragraphs.
     """
     logging.info("Working on Sector Identification")
+    haystack_doc['Sector Label'] = 'NA'
+    df1 = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
+    df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
     if not classifier_model:
         classifier_model = st.session_state['sector_classifier']
 
-    predictions = classifier_model(
+    predictions = classifier_model(list(df1.text))
+
     list_ = []
     for i in range(len(predictions)):
 
@@ -138,9 +93,16 @@ def sector_classification(haystack_doc:List[Document],
         for j in range(len(temp)):
             placeholder[temp[j]['label']] = temp[j]['score']
         list_.append(placeholder)
-    labels_ = [{**
-
-
-
-
+    labels_ = [{**list_[l]} for l in range(len(predictions))]
+    truth_df = DataFrame.from_dict(labels_)
+    truth_df = truth_df.round(2)
+    truth_df = truth_df.astype(float) >= threshold
+    truth_df = truth_df.astype(str)
+    categories = list(truth_df.columns)
+    truth_df['Sector Label'] = truth_df.apply(lambda x: {i if x[i]=='True' else
+                                None for i in categories}, axis=1)
+    truth_df['Sector Label'] = truth_df.apply(lambda x: list(x['Sector Label']
+                                -{None}),axis=1)
+    df1['Sector Label'] = list(truth_df['Sector Label'])
+    df = pd.concat([df,df1])
     return df
utils/target_classifier.py
CHANGED
@@ -8,11 +8,12 @@ from pandas import DataFrame, Series
 from utils.config import getconfig
 from utils.preprocessing import processingpipeline
 import streamlit as st
+from transformers import pipeline
 
 ## Labels dictionary ###
 _lab_dict = {
-            '
-            '
+            'NEGATIVE':'NO TARGET INFO',
+            'TARGET':'TARGET',
             }
 
 @st.cache_resource
@@ -38,61 +39,19 @@ def load_targetClassifier(config_file:str = None, classifier_name:str = None):
             config = getconfig(config_file)
             classifier_name = config.get('target','MODEL')
 
-    logging.info("Loading classifier")
-
-
-
+    logging.info("Loading classifier")
+
+    doc_classifier = pipeline("text-classification",
+                            model=classifier_name,
+                            top_k =1)
 
     return doc_classifier
 
 
-def runTargetPreprocessingPipeline(file_name:str, file_path:str,
-                        split_by: Literal["sentence", "word"] = 'sentence',
-                        split_length:int = 2, split_respect_sentence_boundary:bool = False,
-                        split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
-    """
-    creates the pipeline and runs the preprocessing pipeline,
-    the params for pipeline are fetched from paramconfig
-    Params
-    ------------
-    file_name: filename, in case of streamlit application use
-                st.session_state['filename']
-    file_path: filepath, in case of streamlit application use st.session_state['filepath']
-    split_by: document splitting strategy either as word or sentence
-    split_length: when synthetically creating the paragrpahs from document,
-                it defines the length of paragraph.
-    split_respect_sentence_boundary: Used when using 'word' strategy for
-                splititng of text.
-    split_overlap: Number of words or sentences that overlap when creating
-                the paragraphs. This is done as one sentence or 'some words' make sense
-                when read in together with others. Therefore the overlap is used.
-    remove_punc: to remove all Punctuation including ',' and '.' or not
-    Return
-    --------------
-    List[Document]: When preprocessing pipeline is run, the output dictionary
-                has four objects. For the Haysatck implementation of SDG classification we,
-                need to use the List of Haystack Document, which can be fetched by
-                key = 'documents' on output.
-    """
-
-    target_processing_pipeline = processingpipeline()
-
-    output_target_pre = target_processing_pipeline.run(file_paths = file_path,
-                            params= {"FileConverter": {"file_path": file_path, \
-                                        "file_name": file_name},
-                                    "UdfPreProcessor": {"remove_punc": remove_punc, \
-                                            "split_by": split_by, \
-                                            "split_length":split_length,\
-                                            "split_overlap": split_overlap, \
-                                            "split_respect_sentence_boundary":split_respect_sentence_boundary}})
-
-    return output_target_pre
-
-
 @st.cache_data
-def target_classification(haystack_doc:
-                        threshold:float = 0.
-                        classifier_model:
+def target_classification(haystack_doc:pd.DataFrame,
+                        threshold:float = 0.5,
+                        classifier_model:pipeline= None
                         )->Tuple[DataFrame,Series]:
     """
     Text-Classification on the list of texts provided. Classifier provides the
@@ -117,22 +76,16 @@ def target_classification(haystack_doc:List[Document],
     if not classifier_model:
         classifier_model = st.session_state['target_classifier']
 
-    results = classifier_model
-    labels_= [(l
-                l
+    results = classifier_model(list(haystack_doc.text))
+    labels_= [(l[0]['label'],
+               l[0]['score']) for l in results]
 
 
-
+    df1 = DataFrame(labels_, columns=["Target Label","Relevancy"])
+    df = pd.concat([haystack_doc,df1],axis=1)
 
     df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
     df.index += 1
-    # df =df[df['Relevancy']>threshold]
     df['Label_def'] = df['Target Label'].apply(lambda i: _lab_dict[i])
 
-    # creating the dataframe for value counts of Labels, along with 'title' of Labels
-    # count_df = df['Target Label'].value_counts()
-    # count_df = count_df.rename('count')
-    # count_df = count_df.rename_axis('Target Label').reset_index()
-    # count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
-
     return df
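Note the ordering this commit creates: target_classification adds the 'Target Label' and 'Relevancy' columns that the netzero, ghg, sector, and adapmit classifiers filter on, so it has to run before them. A hedged end-to-end sketch on a toy frame (outside the app, the cached session-state models are replaced by explicitly loaded ones):

    import pandas as pd
    from utils.target_classifier import load_targetClassifier, target_classification
    from utils.netzero_classifier import load_netzeroClassifier, netzero_classification

    paras = pd.DataFrame({'text': ["Achieve net zero emissions by 2050.",
                                   "Chapter 2 describes national circumstances."]})

    df = target_classification(paras,
                               classifier_model=load_targetClassifier(config_file='paramconfig.cfg'))
    df = netzero_classification(df,
                                classifier_model=load_netzeroClassifier(config_file='paramconfig.cfg'))
    print(df[['text', 'Target Label', 'Relevancy', 'Netzero Label']])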
utils/uploadAndExample.py
CHANGED
@@ -11,6 +11,12 @@ def add_upload(choice):
     """
 
     if choice == 'Upload Document':
+
+        if 'filename' in st.session_state:
+            # Delete all the items in Session state
+            for key in st.session_state.keys():
+                del st.session_state[key]
+
         uploaded_file = st.sidebar.file_uploader('Upload the File',
                         type=['pdf', 'docx', 'txt'])
         if uploaded_file is not None:
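The reset added here clears the whole session state whenever a fresh document is uploaded, so paragraphs and classification results cached from the previous file are not shown for the new one. A minimal sketch of the same idea for use inside a Streamlit script (the key names are illustrative; iterating over a list() copy of the keys is a slightly more defensive variant of the loop in the diff):

    import streamlit as st

    # pretend a previous document left state behind
    st.session_state['filename'] = 'old_doc.pdf'
    st.session_state['cached_results'] = 'results from the previous run'

    if 'filename' in st.session_state:
        for key in list(st.session_state.keys()):   # copy the keys before deleting
            del st.session_state[key]

    assert len(st.session_state) == 0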