prashant committed
Commit • 4a6159c
1 Parent(s): 1e18f9c

haystack SDG classification
Browse files

- app.py +1 -1
- appStore/sdg_analysis.py +13 -13
- requirements.txt +1 -1
- udfPreprocess/paramconfig.cfg +4 -1
- udfPreprocess/preprocessing.py +26 -5
- udfPreprocess/sdg_classifier.py +89 -0
- udfPreprocess/uploadAndExample.py +13 -9
app.py CHANGED

@@ -12,6 +12,6 @@ app = MultiApp()
 
 app.add_app("About","house", info.app)
 app.add_app("SDG Analysis","gear",sdg_analysis.app)
-app.add_app("Search","search", keyword_search.app)
+# app.add_app("Search","search", keyword_search.app)
 
 app.run()
appStore/sdg_analysis.py CHANGED

@@ -19,8 +19,9 @@ import docx
 from docx.shared import Inches
 from docx.shared import Pt
 from docx.enum.style import WD_STYLE_TYPE
-from udfPreprocess.
-
+from udfPreprocess.sdg_classifier import sdg_classification
+from udfPreprocess.sdg_classifier import runSDGPreprocessingPipeline
+import configparser
 import tempfile
 import sqlite3
 import logging

@@ -28,14 +29,14 @@ logger = logging.getLogger(__name__)
 
 
 
-@st.cache(allow_output_mutation=True)
-def load_keyBert():
-    return KeyBERT()
+# @st.cache(allow_output_mutation=True)
+# def load_keyBert():
+#     return KeyBERT()
 
-@st.cache(allow_output_mutation=True)
-def load_sdgClassifier():
-    classifier = pipeline("text-classification", model= "jonas/sdg_classifier_osdg")
-    return classifier
+# @st.cache(allow_output_mutation=True)
+# def load_sdgClassifier():
+#     classifier = pipeline("text-classification", model= "jonas/sdg_classifier_osdg")
+#     return classifier
 
 
 

@@ -59,12 +60,11 @@ def app():
 
 
 
-        if '
-
-        docs_processed, df, all_text, par_list = clean.preprocessingForSDG(docs)
+        if 'filepath' in st.session_state:
+            paraList = runSDGPreprocessingPipeline()
         with st.spinner("Running SDG"):
 
-            df, x = sdg_classification(
+            df, x = sdg_classification(paraList)
 
 
             # classifier = load_sdgClassifier()
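Taken together, these hunks replace the old in-app preprocessing with the two helpers from the new udfPreprocess/sdg_classifier.py module. A minimal sketch of the resulting control flow inside app(), assuming the surrounding Streamlit layout; how df and x are rendered is not shown in this hunk, so the last two calls are only one plausible choice:

    import streamlit as st
    from udfPreprocess.sdg_classifier import sdg_classification, runSDGPreprocessingPipeline

    def app():
        # uploadAndExample.py stores the chosen document's path under
        # 'filepath', so classification only runs once a file is selected.
        if 'filepath' in st.session_state:
            paraList = runSDGPreprocessingPipeline()   # file -> list of paragraph strings
            with st.spinner("Running SDG"):
                df, x = sdg_classification(paraList)   # per-paragraph SDG labels
            st.dataframe(df)   # assumption: one way to display the labelled paragraphs
            st.bar_chart(x)    # assumption: SDG counts as a chart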
requirements.txt CHANGED

@@ -10,7 +10,7 @@ pandas==1.4.0
 pdfplumber==0.6.2
 Pillow==9.1.1
 seaborn==0.11.2
-transformers==4.
+transformers==4.21.2
 rank_bm25
 python-docx
 streamlit_option_menu
udfPreprocess/paramconfig.cfg CHANGED

@@ -10,7 +10,10 @@ THRESHOLD = 0.1
 
 [sdg]
 THRESHOLD = 0.85
+MODEL = 'jonas/sdg_classifier_osdg'
+SPLIT_BY = 'word'
+SPLIT_LENGTH = 110
 
 [preprocessor]
-SPLIT_OVERLAP_WORD =
+SPLIT_OVERLAP_WORD = 10
 SPLIT_OVERLAP_SENTENCE = 1
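configparser returns every value as a plain string, which is why the code in this commit wraps its reads in int() and float(). A small sketch of how the new [sdg] keys are consumed, run from the repo root as the committed code assumes. One caveat worth knowing: configparser does not strip quote characters, so values written with quotes in the .cfg file are read back with the quotes included.

    import configparser

    config = configparser.ConfigParser()
    config.read_file(open('udfPreprocess/paramconfig.cfg'))

    threshold = float(config.get('sdg', 'THRESHOLD'))                # 0.85
    model = config.get('sdg', 'MODEL')         # "'jonas/sdg_classifier_osdg'" (quotes kept)
    split_by = config.get('sdg', 'SPLIT_BY')   # "'word'" (quotes kept)
    split_length = int(config.get('sdg', 'SPLIT_LENGTH'))            # 110
    overlap = int(config.get('preprocessor', 'SPLIT_OVERLAP_WORD'))  # 10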
udfPreprocess/preprocessing.py CHANGED

@@ -8,6 +8,7 @@ import pandas as pd
 import logging
 import re
 import string
+from haystack.pipelines import Pipeline
 import configparser
 config = configparser.ConfigParser()
 config.read_file(open('udfPreprocess/paramconfig.cfg'))

@@ -127,6 +128,8 @@ class FileConverter(BaseComponent):
 def basic(s, removePunc:bool = False):
 
     """
+    Performs basic cleaning of text.
+
     Params
     ----------
     s: string to be processed

@@ -148,7 +151,7 @@ def basic(s, removePunc:bool = False):
     s = s.translate(translator)
     # Remove distracting single quotes and dotted pattern
     s = re.sub("\'", " ", s)
-    s =
+    s = s.replace("..","")
 
     return s.strip()
 

@@ -165,8 +168,8 @@ class UdfPreProcessor(BaseComponent):
 
     """
     outgoing_edges = 1
-    split_overlap_word = config.get('preprocessor','SPLIT_OVERLAP_WORD')
-    split_overlap_sentence = config.get('preprocessor','SPLIT_OVERLAP_SENTENCE')
+    split_overlap_word = int(config.get('preprocessor','SPLIT_OVERLAP_WORD'))
+    split_overlap_sentence = int(config.get('preprocessor','SPLIT_OVERLAP_SENTENCE'))
 
     def run(self, documents:List[Document], removePunc:bool,
             split_by: Literal["sentence", "word"] = 'sentence',

@@ -210,6 +213,8 @@ class UdfPreProcessor(BaseComponent):
             split_length=split_length,
             split_respect_sentence_boundary= split_respect_sentence_boundary,
             split_overlap=split_overlap,
+
+            # will add page number only in case of PDF not for text/docx file.
             add_page_number=True
         )
 

@@ -221,7 +226,7 @@ class UdfPreProcessor(BaseComponent):
         df = pd.DataFrame(docs_processed)
         all_text = " ".join(df.content.to_list())
         para_list = df.content.to_list()
-
+        logging.info('document split into {} paragraphs'.format(len(para_list)))
         output = {'documents': docs_processed,
                   'dataframe': df,
                   'text': all_text,

@@ -234,4 +239,20 @@ class UdfPreProcessor(BaseComponent):
     therefore nothing here, however to use the custom node we need to have
     this method for the class.
     """
-    return
+    return
+
+def processingpipeline():
+    """
+    Returns the preprocessing pipeline
+
+    """
+
+    preprocessing_pipeline = Pipeline()
+    fileconverter = FileConverter()
+    customPreprocessor = UdfPreProcessor()
+
+    preprocessing_pipeline.add_node(component=fileconverter, name="FileConverter", inputs=["File"])
+    preprocessing_pipeline.add_node(component = customPreprocessor, name ='UdfPreprocessor', inputs=["FileConverter"])
+
+    return preprocessing_pipeline
+
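With the two custom nodes registered, processingpipeline() can be driven on its own, outside Streamlit. A sketch of a standalone run, with a placeholder document path; it mirrors the run() call in the new sdg_classifier.py module, and assumes the output dict assembled in UdfPreProcessor.run() also carries a 'paraList' key, as runSDGPreprocessingPipeline() below relies on. Note the params dict must be keyed by the names passed to add_node() above:

    from udfPreprocess.preprocessing import processingpipeline

    pipeline = processingpipeline()

    result = pipeline.run(
        file_paths='sample/some_document.pdf',   # placeholder path
        params={"FileConverter": {"file_path": 'sample/some_document.pdf',
                                  "file_name": 'some_document.pdf'},
                "UdfPreprocessor": {"removePunc": False,
                                    "split_by": 'word',
                                    "split_length": 110}})

    paragraphs = result['paraList']   # plain list of paragraph strings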
udfPreprocess/sdg_classifier.py ADDED

@@ -0,0 +1,89 @@
+from tkinter import Text
+from haystack.nodes import TransformersDocumentClassifier
+from typing import List, Tuple
+import configparser
+import streamlit as st
+from pandas import DataFrame, Series
+import logging
+from udfPreprocess.preprocessing import processingpipeline
+config = configparser.ConfigParser()
+config.read_file(open('udfPreprocess/paramconfig.cfg'))
+
+@st.cache(allow_output_mutation=True)
+def load_sdgClassifier():
+    """
+    loads the document classifier using haystack, where the name/path of model
+    in HF-hub as string is used to fetch the model object.
+    1. https://docs.haystack.deepset.ai/reference/document-classifier-api
+    2. https://docs.haystack.deepset.ai/docs/document_classifier
+
+    Return: document classifier model
+    """
+    logging.info("Loading classifier")
+    doc_classifier_model = config.get('sdg','MODEL')
+    doc_classifier = TransformersDocumentClassifier(
+        model_name_or_path=doc_classifier_model,
+        task="text-classification")
+    return doc_classifier
+
+
+def sdg_classification(paraList:List[Text])->Tuple(DataFrame,Series):
+    """
+    Text-Classification on the list of texts provided. Classifier provides the
+    most appropriate label for each text. these labels are in terms of if text
+    belongs to which particular Sustainable Devleopment Goal (SDG).
+
+    Params
+    ---------
+    paraList: List of paragrpahs/text. The output of Preprocessing Pipeline
+    contains this list of paragraphs in different format, the simple List format
+    is being used here.
+
+    Returns
+    ----------
+    df: Dataframe with two columns['SDG:int', 'text']
+    x: Series object with the unique SDG covered in the document uploaded and
+    the number of times it is covered/discussed/count_of_paragraphs.
+
+    """
+    logging.info("running SDG classifiication")
+    threshold = float(config.get('sdg','THRESHOLD'))
+
+
+    classifier = load_sdgClassifier()
+    labels = classifier(paraList)
+
+    labels_= [(l['label'],l['score']) for l in labels]
+    df = DataFrame(labels_, columns=["SDG", "Relevancy"])
+
+    df['text'] = paraList
+    df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
+    df.index += 1
+    df =df[df['Relevancy']>threshold]
+    x = df['SDG'].value_counts()
+    # df = df.copy()
+    df= df.drop(['Relevancy'], axis = 1)
+
+
+    return df, x
+
+def runSDGPreprocessingPipeline()->List[Text]:
+    """
+    creates the pipeline and runs the preprocessing pipeline,
+    the params for pipeline are fetched from paramconfig
+
+    """
+    file_path = st.session_state['filepath']
+    file_name = st.session_state['filename']
+    sdg_processing_pipeline = processingpipeline()
+    split_by = config.get('sdg','SPLIT_BY')
+    split_length = int(config.get('sdg','SPLIT_LENGTH'))
+
+    output_sdg_pre = sdg_processing_pipeline.run(file_paths = file_path,
+        params= {"FileConverter": {"file_path": file_path, \
+                 "file_name": file_name},
+                 "UdfPreProcessor": {"removePunc": False, \
+                 "split_by": split_by, \
+                 "split_length":split_length}})
+
+    return output_sdg_pre['paraList']
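Two details in this new module deserve a flag. Text is imported from tkinter (the Tk text widget) rather than from typing, and Tuple(DataFrame,Series) calls the typing generic instead of subscripting it, which raises TypeError the moment the module is imported, since annotations are evaluated at function definition time. Separately, the params dict keys the preprocessor node as "UdfPreProcessor" while preprocessing.py registers it under the name 'UdfPreprocessor'; the two spellings have to match for the params to reach the node. A sketch of the corrected header (a fix, not part of this commit):

    from typing import List, Tuple, Text   # Text from typing, not tkinter

    from pandas import DataFrame, Series

    # Subscript the generic with []; Tuple(...) raises TypeError at import time.
    def sdg_classification(paraList: List[Text]) -> Tuple[DataFrame, Series]:
        ...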
udfPreprocess/uploadAndExample.py CHANGED

@@ -16,10 +16,12 @@ def add_upload(choice):
         # st.write("Uploaded Filename: ", uploaded_file.name)
         file_name = uploaded_file.name
         file_path = temp.name
-        docs = pre.load_document(file_path, file_name)
-        haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
-        st.session_state['
-        st.session_state['paraList'] = paraList
+        # docs = pre.load_document(file_path, file_name)
+        # haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
+        st.session_state['filename'] = file_name
+        # st.session_state['paraList'] = paraList
+        st.session_state['filepath'] = file_path
+
 
 
     else:

@@ -30,6 +32,7 @@ def add_upload(choice):
         if option is 'South Africa:Low Emission strategy':
             file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
             st.session_state['filename'] = file_name
+            st.sesion_state['filepath'] = file_path
             # st.write("Selected document:", file_name.split('/')[1])
             # with open('sample/South Africa_s Low Emission Development Strategy.txt') as dfile:
             # file = open('sample/South Africa_s Low Emission Development Strategy.txt', 'wb')

@@ -37,12 +40,13 @@ def add_upload(choice):
             # with open('sample/Ethiopia_s_2021_10 Year Development Plan.txt') as dfile:
             file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
             st.session_state['filename'] = file_name
+            st.sesion_state['filepath'] = file_path
             # st.write("Selected document:", file_name.split('/')[1])
 
-        if option is not None:
-            docs = pre.load_document(file_path,file_name)
-            haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
-            st.session_state['docs'] = docs
-            st.session_state['paraList'] = paraList
+        # if option is not None:
+        #     docs = pre.load_document(file_path,file_name)
+        #     haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
+        #     st.session_state['docs'] = docs
+        #     st.session_state['paraList'] = paraList
 
 
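One typo in this file is worth noting: both sample-document branches write to st.sesion_state, an attribute that does not exist on the streamlit module, so selecting either example would raise AttributeError before 'filepath' is ever stored. The is comparison on string literals is also fragile, since it tests object identity rather than equality. A sketch of the intended lines (a fix, not part of this commit):

    if option == 'South Africa:Low Emission strategy':   # '==' instead of 'is'
        file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
        st.session_state['filename'] = file_name
        st.session_state['filepath'] = file_path         # 'session_state', not 'sesion_state'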