adding appstore and utils main scripts
- app.py +30 -1
- appStore/__init__.py +1 -0
- appStore/doc_processing.py +80 -0
- appStore/target.py +111 -0
- appStore/vulnerability_analysis.py +169 -0
- utils/target_classifier.py +125 -0
- utils/vulnerability_classifier.py +137 -1
app.py
CHANGED
@@ -2,6 +2,9 @@ import streamlit as st
 from utils.uploadAndExample import add_upload
 from utils.config import model_dict
 from utils.vulnerability_classifier import label_dict
+import appStore.doc_processing as processing
+import appStore.vulnerability_analysis as vulnerability_analysis
+import appStore.target as target_analysis
 
 with st.sidebar:
     # upload and example doc
@@ -23,4 +26,30 @@ with st.sidebar:
 
 with st.container():
     st.markdown("<h2 style='text-align: center;'> Vulnerability Analysis 3.1 </h2>", unsafe_allow_html=True)
-    st.write(' ')
+    st.write(' ')
+
+    with st.expander("ℹ️ - About this app", expanded=False):
+        st.write(
+            """
+            The Vulnerability Analysis App is an open-source digital tool which
+            aims to assist policy analysts and other users in extracting and
+            filtering references to different groups in vulnerable situations
+            from public documents. We use Natural Language Processing (NLP),
+            specifically deep learning-based text representations, to search
+            context-sensitively for mentions of the special needs of groups in
+            vulnerable situations and to cluster them thematically.
+            For more information on the methodology, [click here](https://vulnerability-analysis.streamlit.app/).
+            """)
+
+        st.write("""
+            What happens in the background?
+
+            - Step 1: Once the document is provided to the app, it undergoes *pre-processing*:
+              the document is broken into smaller paragraphs (based on word/sentence count).
+            - Step 2: The paragraphs are then fed to the **Vulnerability Classifier**, which detects
+              whether the paragraph contains one or multiple references to vulnerable groups.
+            """)
+
+        st.write("")
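The three new imports bring in the pipeline stages. This hunk does not show the call sites, but each module exposes an app() that chains through st.session_state; a minimal sketch of the assumed wiring:

# Hypothetical wiring (the call sites are not part of this hunk):
# processing.app() stores the paragraph dataframe under 'key0',
# vulnerability_analysis.app() turns 'key0' into labelled 'key1',
# target_analysis.app() turns 'key1' into 'key2' for display.
processing.app()
vulnerability_analysis.app()
target_analysis.app()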
appStore/__init__.py
ADDED
@@ -0,0 +1 @@
+# more app related files
appStore/doc_processing.py
ADDED
@@ -0,0 +1,80 @@
+# set path
+import glob, os, sys
+sys.path.append('../utils')
+from typing import List, Tuple
+from typing_extensions import Literal
+from haystack.schema import Document
+from utils.config import get_classifier_params
+from utils.preprocessing import processingpipeline, paraLengthCheck
+import streamlit as st
+import logging
+import pandas as pd
+import nltk
+nltk.download('punkt_tab')
+
+params = get_classifier_params("preprocessing")
+
+@st.cache_data
+def runPreprocessingPipeline(file_name: str, file_path: str,
+                             split_by: Literal["sentence", "word"] = 'sentence',
+                             split_length: int = 2, split_respect_sentence_boundary: bool = False,
+                             split_overlap: int = 0, remove_punc: bool = False) -> List[Document]:
+    """
+    Creates and runs the preprocessing pipeline; the pipeline params are
+    fetched from paramconfig.
+
+    Params
+    ------------
+    file_name: filename; in a Streamlit application use st.session_state['filename']
+    file_path: filepath; in a Streamlit application use st.session_state['filepath']
+    split_by: document splitting strategy, either 'word' or 'sentence'
+    split_length: length of each paragraph when synthetically creating
+        paragraphs from the document
+    split_respect_sentence_boundary: used with the 'word' splitting strategy
+    split_overlap: number of words or sentences that overlap between
+        consecutive paragraphs. A sentence or a few words often only make
+        sense when read together with their neighbours, hence the overlap.
+    remove_punc: whether to remove all punctuation, including ',' and '.'
+
+    Return
+    --------------
+    List[Document]: the preprocessing pipeline output dictionary has four
+    objects. For the Haystack implementation of SDG classification we need
+    the list of Haystack Documents, which can be fetched by
+    key = 'documents' on the output.
+    """
+
+    processing_pipeline = processingpipeline()
+
+    output_pre = processing_pipeline.run(file_paths=file_path,
+                                         params={"FileConverter": {"file_path": file_path,
+                                                                   "file_name": file_name},
+                                                 "UdfPreProcessor": {"remove_punc": remove_punc,
+                                                                     "split_by": split_by,
+                                                                     "split_length": split_length,
+                                                                     "split_overlap": split_overlap,
+                                                                     "split_respect_sentence_boundary": split_respect_sentence_boundary}})
+
+    return output_pre
+
+
+def app():
+    with st.container():
+        if 'filepath' in st.session_state:
+            file_name = st.session_state['filename']
+            file_path = st.session_state['filepath']
+
+            all_documents = runPreprocessingPipeline(file_name=file_name,
+                                                     file_path=file_path, split_by=params['split_by'],
+                                                     split_length=params['split_length'],
+                                                     split_respect_sentence_boundary=params['split_respect_sentence_boundary'],
+                                                     split_overlap=params['split_overlap'], remove_punc=params['remove_punc'])
+            # cap paragraph length at 100 words, keeping page numbers
+            paralist = paraLengthCheck(all_documents['documents'], 100)
+            df = pd.DataFrame(paralist, columns=['text', 'page'])
+            # saving the dataframe to session state
+            st.session_state['key0'] = df
+
+        else:
+            st.info("🤔 No document found, please try to upload it at the sidebar!")
+            logging.warning("Terminated as no document provided")
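Outside Streamlit, the pipeline above can be driven directly. A minimal usage sketch, assuming the repo's utils package is importable and 'example.pdf' is a hypothetical local file (not part of the repo):

from appStore.doc_processing import runPreprocessingPipeline
from utils.preprocessing import paraLengthCheck
import pandas as pd

# 'example.pdf' is a placeholder path for illustration only
output = runPreprocessingPipeline(file_name='example.pdf', file_path='example.pdf',
                                  split_by='sentence', split_length=2)
paralist = paraLengthCheck(output['documents'], 100)  # cap paragraphs at ~100 words
df = pd.DataFrame(paralist, columns=['text', 'page'])
print(df.head())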
appStore/target.py
ADDED
@@ -0,0 +1,111 @@
+# set path
+import glob, os, sys
+sys.path.append('../utils')
+
+# import needed libraries
+import seaborn as sns
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import streamlit as st
+from utils.target_classifier import load_targetClassifier, target_classification
+import logging
+logger = logging.getLogger(__name__)
+from utils.config import get_classifier_params
+from utils.preprocessing import paraLengthCheck
+from io import BytesIO
+import xlsxwriter
+import plotly.express as px
+from utils.target_classifier import label_dict
+from appStore.rag import run_query
+
+# Declare all the necessary variables
+classifier_identifier = 'target'
+params = get_classifier_params(classifier_identifier)
+
+@st.cache_data
+def to_excel(df, sectorlist):
+    # write the dataframe to an in-memory Excel file and attach dropdown
+    # validations to the review columns
+    len_df = len(df)
+    output = BytesIO()
+    writer = pd.ExcelWriter(output, engine='xlsxwriter')
+    df.to_excel(writer, index=False, sheet_name='Sheet1')
+    workbook = writer.book
+    worksheet = writer.sheets['Sheet1']
+    worksheet.data_validation('S2:S{}'.format(len_df),
+                              {'validate': 'list',
+                               'source': ['No', 'Yes', 'Discard']})
+    worksheet.data_validation('X2:X{}'.format(len_df),
+                              {'validate': 'list',
+                               'source': sectorlist + ['Blank']})
+    worksheet.data_validation('T2:T{}'.format(len_df),
+                              {'validate': 'list',
+                               'source': sectorlist + ['Blank']})
+    worksheet.data_validation('U2:U{}'.format(len_df),
+                              {'validate': 'list',
+                               'source': sectorlist + ['Blank']})
+    worksheet.data_validation('V2:V{}'.format(len_df),
+                              {'validate': 'list',
+                               'source': sectorlist + ['Blank']})
+    worksheet.data_validation('W2:W{}'.format(len_df),
+                              {'validate': 'list',
+                               'source': sectorlist + ['Blank']})
+    writer.save()
+    processed_data = output.getvalue()
+    return processed_data
+
+def app():
+
+    ### Main app code ###
+    with st.container():
+
+        if 'key1' in st.session_state:
+
+            # Load the existing dataset
+            df = st.session_state.key1
+
+            # Filter out all paragraphs that do not have a reference to groups
+            df = df[df['Vulnerability Label'].apply(lambda x: len(x) > 0 and 'Other' not in x)]
+
+            # Load the classifier model
+            classifier = load_targetClassifier(classifier_name=params['model_name'])
+
+            st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
+
+            df = target_classification(haystack_doc=df,
+                                       threshold=params['threshold'])
+
+            # Rename column
+            df.rename(columns={'Target Label': 'Specific action/target/measure mentioned'}, inplace=True)
+
+            st.session_state.key2 = df
+
+
+def target_display(model_sel_name):
+
+    ### TABLE Output ###
+
+    # Assign the dataframe a name
+    df = st.session_state['key2']
+    st.write(df)
+
+    ### RAG Output by group ###
+
+    # Expand the DataFrame: one row per (paragraph, group) pair
+    df_expand = (
+        df.query("`Specific action/target/measure mentioned` == 'YES'")
+        .explode('Vulnerability Label')
+    )
+    # Group by 'Vulnerability Label' and concatenate 'text'
+    df_agg = df_expand.groupby('Vulnerability Label')['text'].agg('; '.join).reset_index()
+
+    st.markdown("----")
+    st.markdown('**DOCUMENT FINDINGS SUMMARY BY VULNERABILITY LABEL:**')
+
+    # construct a RAG query for each label, send it to OpenAI and process the response
+    for i in range(0, len(df_agg)):
+        st.write(df_agg['Vulnerability Label'].iloc[i])
+        run_query(context=df_agg['text'].iloc[i], label=df_agg['Vulnerability Label'].iloc[i], model_sel_name=model_sel_name)
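The explode/groupby step is the core of the per-group summary. A toy, Streamlit-free illustration of what it produces (data is made up for the example):

import pandas as pd

df = pd.DataFrame({
    'text': ['Paragraph about children.', 'Paragraph about women and children.'],
    'Specific action/target/measure mentioned': ['YES', 'YES'],
    'Vulnerability Label': [['Children'], ['Women and other genders', 'Children']],
})

# one row per (paragraph, group) pair, then join all paragraphs per group
df_expand = (df.query("`Specific action/target/measure mentioned` == 'YES'")
               .explode('Vulnerability Label'))
df_agg = df_expand.groupby('Vulnerability Label')['text'].agg('; '.join).reset_index()
print(df_agg)
# 'Children' gets both paragraphs joined with '; '; that joined text is the
# context handed to run_query for the group.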
appStore/vulnerability_analysis.py
ADDED
@@ -0,0 +1,169 @@
+# set path
+import glob, os, sys
+sys.path.append('../utils')
+
+# import needed libraries
+import seaborn as sns
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import streamlit as st
+from utils.vulnerability_classifier import load_vulnerabilityClassifier, vulnerability_classification
+import logging
+logger = logging.getLogger(__name__)
+from utils.config import get_classifier_params
+from utils.preprocessing import paraLengthCheck
+from io import BytesIO
+import xlsxwriter
+import plotly.express as px
+import plotly.graph_objects as go
+from utils.vulnerability_classifier import label_dict
+
+
+# Declare all the necessary variables
+classifier_identifier = 'vulnerability'
+params = get_classifier_params(classifier_identifier)
+
+@st.cache_data
+def to_excel(df, sectorlist):
+    # same export helper as in appStore/target.py: write the dataframe to an
+    # in-memory Excel file and attach dropdown validations to the review columns
+    len_df = len(df)
+    output = BytesIO()
+    writer = pd.ExcelWriter(output, engine='xlsxwriter')
+    df.to_excel(writer, index=False, sheet_name='Sheet1')
+    workbook = writer.book
+    worksheet = writer.sheets['Sheet1']
+    worksheet.data_validation('S2:S{}'.format(len_df),
+                              {'validate': 'list',
+                               'source': ['No', 'Yes', 'Discard']})
+    worksheet.data_validation('X2:X{}'.format(len_df),
+                              {'validate': 'list',
+                               'source': sectorlist + ['Blank']})
+    worksheet.data_validation('T2:T{}'.format(len_df),
+                              {'validate': 'list',
+                               'source': sectorlist + ['Blank']})
+    worksheet.data_validation('U2:U{}'.format(len_df),
+                              {'validate': 'list',
+                               'source': sectorlist + ['Blank']})
+    worksheet.data_validation('V2:V{}'.format(len_df),
+                              {'validate': 'list',
+                               'source': sectorlist + ['Blank']})
+    worksheet.data_validation('W2:W{}'.format(len_df),
+                              {'validate': 'list',
+                               'source': sectorlist + ['Blank']})
+    writer.save()
+    processed_data = output.getvalue()
+    return processed_data
+
+def app():
+
+    ### Main app code ###
+    with st.container():
+
+        # If a document has been processed
+        if 'key0' in st.session_state:
+
+            # Run the vulnerability classifier
+            df = st.session_state.key0
+            classifier = load_vulnerabilityClassifier(classifier_name=params['model_name'])
+            st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
+
+            # Get the predictions
+            df = vulnerability_classification(haystack_doc=df,
+                                              threshold=params['threshold'])
+
+            # Store df in session state with key1
+            st.session_state.key1 = df
+
+
+def vulnerability_display():
+
+    # Get the vulnerability df
+    df = st.session_state['key1']
+
+    # Filter the dataframe to only show the paragraphs with references
+    df_filtered = df[df['Vulnerability Label'].apply(lambda x: len(x) > 0 and 'Other' not in x)]
+
+    # Rename column
+    df_filtered.rename(columns={'Vulnerability Label': 'Group(s)'}, inplace=True)
+
+    # Header
+    st.subheader("Explore references to vulnerable groups:")
+
+    # Text
+    num_paragraphs = len(df['Vulnerability Label'])
+    num_references = len(df_filtered['Group(s)'])
+
+    st.markdown(f"""<div style="text-align: justify;">The document contains a
+                total of <span style="color: red;">{num_paragraphs}</span> paragraphs.
+                We identified <span style="color: red;">{num_references}</span>
+                references to groups in vulnerable situations.</div>
+                <br>
+                <div style="text-align: justify;">We are searching for references related
+                to the following groups: (1) Agricultural communities, (2) Children, (3)
+                Ethnic, racial and other minorities, (4) Fishery communities, (5) Informal sector
+                workers, (6) Members of indigenous and local communities, (7) Migrants and
+                displaced persons, (8) Older persons, (9) Persons living in poverty, (10)
+                Persons living with disabilities, (11) Persons with pre-existing health conditions,
+                (12) Residents of drought-prone regions, (13) Rural populations, (14) Sexual
+                minorities (LGBTQI+), (15) Urban populations, (16) Women and other genders.</div>
+                <br>
+                <div style="text-align: justify;">The chart below shows the groups for which
+                references were found and the number of references identified.
+                For a more detailed view, see the paragraphs and
+                their respective labels in the table underneath.</div>""", unsafe_allow_html=True)
+
+
+    ### Bar chart
+
+    # Create a df that stores all the labels
+    df_labels = pd.DataFrame(list(label_dict.items()), columns=['Label ID', 'Label'])
+
+    # Count how often each label appears in the "Group(s)" column
+    group_counts = {}
+
+    # Iterate through each row
+    for index, row in df_filtered.iterrows():
+
+        # Iterate through each group in the row's list of labels
+        for group in row['Group(s)']:
+
+            # Update the count in the dictionary
+            group_counts[group] = group_counts.get(group, 0) + 1
+
+    # Create a new dataframe from group_counts
+    df_label_count = pd.DataFrame(list(group_counts.items()), columns=['Label', 'Count'])
+
+    # Merge the label counts with the df_labels DataFrame
+    df_label_count = df_labels.merge(df_label_count, on='Label', how='left')
+
+    # Exclude the "Other" group and all groups that do not have a label
+    df_bar_chart = df_label_count[df_label_count['Label'] != 'Other']
+    df_bar_chart = df_bar_chart.dropna(subset=['Count'])
+
+    # Bar chart
+    fig = go.Figure()
+
+    fig.add_trace(go.Bar(
+        y=df_bar_chart.Label,
+        x=df_bar_chart.Count,
+        orientation='h',
+        marker=dict(color='purple'),
+    ))
+
+    # Customize layout
+    fig.update_layout(
+        title='Number of references identified',
+        xaxis_title='Number of references',
+        yaxis_title='Group',
+    )
+
+    # Show the plot
+    st.plotly_chart(fig, use_container_width=True)
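The explicit counting loop above can be sanity-checked with pandas' explode/value_counts, which computes the same per-group counts; a toy sketch with made-up data:

import pandas as pd

df_filtered = pd.DataFrame({'Group(s)': [['Children'], ['Children', 'Older persons']]})

# one row per group mention, then count occurrences per group
counts = df_filtered['Group(s)'].explode().value_counts()
print(counts.to_dict())  # {'Children': 2, 'Older persons': 1}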
utils/target_classifier.py
ADDED
@@ -0,0 +1,125 @@
+from typing import List, Tuple
+from typing_extensions import Literal
+import logging
+import pandas as pd
+from pandas import DataFrame, Series
+from utils.config import getconfig
+from utils.preprocessing import processingpipeline
+import streamlit as st
+from setfit import SetFitModel
+from transformers import pipeline
+
+### Labels dictionary ###
+label_dict = {
+    0: 'NO',
+    1: 'YES',
+}
+
+def get_target_labels(preds):
+    """
+    Takes the numerical predictions as input and returns a list of the label names.
+    """
+
+    # Turn into list
+    preds_list = preds.numpy().tolist()
+
+    # Get label names
+    predictions_names = []
+
+    # loop through each prediction
+    for ele in preds_list:
+
+        # see if there is a value 1 and retrieve its index
+        try:
+            index_of_one = ele.index(1)
+        except ValueError:
+            index_of_one = "NA"
+
+        # Retrieve the name of the label (if no prediction was made: "Other")
+        if index_of_one != "NA":
+            name = label_dict[index_of_one]
+        else:
+            name = "Other"
+
+        # Append name to list
+        predictions_names.append(name)
+
+    return predictions_names
+
+@st.cache_resource
+def load_targetClassifier(config_file: str = None, classifier_name: str = None):
+    """
+    Loads the document classifier, where the name/path of the model on the
+    HF hub is used to fetch the model object. Either a config file or a
+    model name should be passed.
+    1. https://docs.haystack.deepset.ai/reference/document-classifier-api
+    2. https://docs.haystack.deepset.ai/docs/document_classifier
+    Params
+    --------
+    config_file: config file path from which to read the model name
+    classifier_name: if a model name is passed it takes priority; if not
+        found, the config file is used, else a warning is logged.
+    Return: document classifier model
+    """
+    if not classifier_name:
+        if not config_file:
+            logging.warning("Pass either model name or config file")
+            return
+        else:
+            config = getconfig(config_file)
+            classifier_name = config.get('target', 'MODEL')
+
+    logging.info("Loading classifier")
+
+    # Loading classifier (note: the model is hard-coded here, so classifier_name
+    # is currently not used for the download)
+    doc_classifier = SetFitModel.from_pretrained("leavoigt/vulnerability_target")
+
+    return doc_classifier
+
+
+@st.cache_data
+def target_classification(haystack_doc: pd.DataFrame,
+                          threshold: float = 0.5,
+                          classifier_model: pipeline = None
+                          ) -> Tuple[DataFrame, Series]:
+    """
+    Text classification on the list of texts provided. The classifier assigns
+    the most appropriate label to each text; these labels indicate whether the
+    paragraph references a specific action, target or measure.
+    ---------
+    haystack_doc: DataFrame of paragraphs produced by the preprocessing pipeline.
+    threshold: threshold value for the model to keep the results from the classifier
+    classifier_model: you can pass the classifier model directly, which takes
+        priority; otherwise the model is looked up in the Streamlit session.
+        In a Streamlit app, avoid passing the model directly.
+    Returns
+    ----------
+    haystack_doc: the input DataFrame with an added 'Target Label' column
+        ('YES'/'NO', or 'Other' if no prediction was made).
+    """
+
+    logging.info("Working on target/action identification")
+
+    haystack_doc['Target Label'] = 'NA'
+
+    if not classifier_model:
+        classifier_model = st.session_state['target_classifier']
+
+    # Get predictions
+    predictions = classifier_model(list(haystack_doc.text))
+
+    # Get labels for predictions
+    pred_labels = get_target_labels(predictions)
+
+    # Save labels
+    haystack_doc['Target Label'] = pred_labels
+
+    return haystack_doc
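A quick check of the label mapping above (assumes the repo environment, since the module imports streamlit and setfit; the tensor mimics the 0/1 prediction rows SetFit returns via torch):

import torch
from utils.target_classifier import get_target_labels

preds = torch.tensor([[0, 1],   # 1 at index 1  -> 'YES'
                      [1, 0],   # 1 at index 0  -> 'NO'
                      [0, 0]])  # no 1 anywhere -> 'Other'
print(get_target_labels(preds))  # ['YES', 'NO', 'Other']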
utils/vulnerability_classifier.py
CHANGED
@@ -1,3 +1,15 @@
+from typing import List, Tuple
+from typing_extensions import Literal
+import logging
+import pandas as pd
+from pandas import DataFrame, Series
+from utils.config import getconfig
+from utils.preprocessing import processingpipeline
+import streamlit as st
+from transformers import pipeline
+from setfit import SetFitModel
+
+
 # labels dictionary
 label_dict= {0: 'Agricultural communities',
              1: 'Children',
@@ -16,4 +28,128 @@ label_dict= {0: 'Agricultural communities',
              14: 'Rural populations',
              15: 'Sexual minorities (LGBTQI+)',
              16: 'Urban populations',
-             17: 'Women and other genders'}
+             17: 'Women and other genders'}
+
+def get_vulnerability_labels(preds):
+    """
+    Takes the numerical predictions as input and returns a list of label-name lists.
+    """
+
+    # Turn predictions into a list of lists
+    preds_list = preds.tolist()
+
+    # Get the names of the groups where the prediction is equal to 1
+    result = []
+
+    for sublist in preds_list:
+        names = [label_dict[key] for key, value in enumerate(sublist) if value == 1]
+        result.append(names)
+
+    return result
+
+@st.cache_resource
+def load_vulnerabilityClassifier(config_file: str = None, classifier_name: str = None):
+    """
+    Loads the document classifier, where the name/path of the model on the
+    HF hub is used to fetch the model object. Either a config file or a
+    model name should be passed.
+    1. https://docs.haystack.deepset.ai/reference/document-classifier-api
+    2. https://docs.haystack.deepset.ai/docs/document_classifier
+    Params
+    --------
+    config_file: config file path from which to read the model name
+    classifier_name: if a model name is passed it takes priority; if not
+        found, the config file is used, else a warning is logged.
+    Return: document classifier model
+    """
+
+    # If no classifier name is given, read it from the config file
+    if not classifier_name:
+        if not config_file:
+            logging.warning("Pass either model name or config file")
+            return
+        else:
+            config = getconfig(config_file)
+            classifier_name = config.get('vulnerability', 'MODEL')
+
+    logging.info("Loading vulnerability classifier")
+
+    # We load a SetFit model directly because the model is multilabel and
+    # Haystack's DocumentClassifier does not support multilabel. (With a
+    # transformers pipeline we would pass 'sigmoid' explicitly to make it
+    # multilabel; otherwise softmax is applied, which is not desired here.)
+    # doc_classifier = TransformersDocumentClassifier(
+    #                     model_name_or_path=classifier_name,
+    #                     task="text-classification",
+    #                     top_k=None)
+
+    # Download model from HF Hub
+    doc_classifier = SetFitModel.from_pretrained(classifier_name)
+
+    # doc_classifier = pipeline("text-classification",
+    #                           model=classifier_name,
+    #                           return_all_scores=True,
+    #                           function_to_apply="sigmoid")
+
+    return doc_classifier
+
+
+@st.cache_data
+def vulnerability_classification(haystack_doc: pd.DataFrame,
+                                 threshold: float = 0.5,
+                                 classifier_model: pipeline = None
+                                 ) -> Tuple[DataFrame, Series]:
+    """
+    Text classification on the list of texts provided. The classifier assigns
+    the most appropriate labels to each text; these labels indicate whether the
+    text references a group in a vulnerable situation.
+    ---------
+    haystack_doc: DataFrame of paragraphs produced by the preprocessing pipeline.
+    threshold: threshold value for the model to keep the results from the classifier
+    classifier_model: you can pass the classifier model directly, which takes
+        priority; otherwise the model is looked up in the Streamlit session.
+        In a Streamlit app, avoid passing the model directly.
+    Returns
+    ----------
+    haystack_doc: the input DataFrame with an added 'Vulnerability Label'
+        column holding the list of groups detected in each paragraph.
+    """
+    logging.info("Working on vulnerability identification")
+    haystack_doc['Vulnerability Label'] = 'NA'
+
+    if not classifier_model:
+        classifier_model = st.session_state['vulnerability_classifier']
+
+    predictions = classifier_model(list(haystack_doc.text))
+
+    pred_labels = get_vulnerability_labels(predictions)
+
+    haystack_doc['Vulnerability Label'] = pred_labels
+
+    return haystack_doc
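A quick check of the multilabel mapping (toy 0/1 rows over the 18 entries of label_dict; assumes numpy, matching the array-like predictions SetFit returns):

import numpy as np
from utils.vulnerability_classifier import get_vulnerability_labels

# two paragraphs: the first mentions groups at indices 0 and 2, the second mentions none
preds = np.array([[1, 0, 1] + [0] * 15,
                  [0] * 18])
print(get_vulnerability_labels(preds))
# [['Agricultural communities', 'Ethnic, racial and other minorities'], []]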