Commit 49a314a by prashant
Parent(s): f47e7d4

ver0.2 udfpreprocess update

Files changed:
- udfPreprocess/cleaning.py +16 -4
- udfPreprocess/docPreprocessing.py +6 -6
- udfPreprocess/paramconfig.cfg +12 -0
- udfPreprocess/sdg.py +57 -0
- udfPreprocess/search.py +145 -0
- udfPreprocess/uploadAndExample.py +48 -0
udfPreprocess/cleaning.py
CHANGED
@@ -1,3 +1,4 @@
+import logging
 import pandas as pd
 import numpy as np
 import string
@@ -10,7 +11,7 @@ import streamlit as st
 from haystack.nodes import PreProcessor
 
 '''basic cleaning - suitable for transformer models'''
-def basic(s):
+def basic(s,SDG = False):
     """
     :param s: string to be processed
     :return: processed string: see comments in the source code for more info
@@ -23,6 +24,15 @@ def basic(s):
     # Remove URLs
     s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
     s = re.sub(r"http\S+", " ", s)
+    if SDG == True:
+        s = s.lower()
+        translator = str.maketrans(' ', ' ', string.punctuation)
+        s = s.translate(translator)
+        s = re.sub('\n', ' ', s)
+        s = re.sub("\'", " ", s)
+        s = re.sub(r'\d+', ' ', s)
+        s = re.sub(r'\W+', ' ', s)
+
     # Remove new line characters
     #s = re.sub('\n', ' ', s)
 
@@ -59,9 +69,10 @@ def preprocessingForSDG(document):
     for i in document:
         docs_processed = preprocessor.process([i])
         for item in docs_processed:
-            item.content = basic(item.content)
+            item.content = basic(item.content, SDG = True)
 
-    st.
+    with st.spinner("👑 document being splitted into paragraphs"):
+        logging.info("document has been splitted to {} paragraphs".format(len(docs_processed)))
 
     # create dataframe of text and list of all text
     df = pd.DataFrame(docs_processed)
@@ -93,7 +104,8 @@ def preprocessing(document):
     for item in docs_processed:
         item.content = basic(item.content)
 
-    st.
+    with st.spinner("👑 document being splitted into paragraphs"):
+        logging.info("document has been splitted to {} paragraphs".format(len(docs_processed)))
 
     # create dataframe of text and list of all text
     df = pd.DataFrame(docs_processed)
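
For orientation, a minimal standalone sketch of what the new SDG branch in basic() does to a paragraph. It re-implements the added lines for illustration only (the helper name basic_sdg and the sample string are not part of the commit):

    import re
    import string

    def basic_sdg(s):
        # Mirrors the SDG=True branch added above: strip URLs, lowercase,
        # drop punctuation, newlines, apostrophes, digits and non-word characters.
        s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
        s = re.sub(r"http\S+", " ", s)
        s = s.lower()
        s = s.translate(str.maketrans(' ', ' ', string.punctuation))
        s = re.sub('\n', ' ', s)
        s = re.sub("\'", " ", s)
        s = re.sub(r'\d+', ' ', s)
        s = re.sub(r'\W+', ' ', s)
        return s

    print(basic_sdg("Goal 13: Take urgent action!\nSee https://sdgs.un.org/goals"))
    # prints roughly: 'goal take urgent action see '

With this change, preprocessingForSDG() applies the aggressive SDG cleaning, while preprocessing() keeps the lighter default cleaning used for search.
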
udfPreprocess/docPreprocessing.py
CHANGED
@@ -65,11 +65,11 @@ def load_document(
     This can happen whith certain pdf types.'''
     for i in documents:
         if i.content == "":
-            st.
-
-
-
-
-
+            with st.spinner("using pdfplumber"):
+                text = []
+                with pdfplumber.open(file_path) as pdf:
+                    for page in pdf.pages:
+                        text.append(page.extract_text())
+                i.content = ' '.join([page for page in text])
 
     return documents
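
The fallback above only runs when the converter returns an empty i.content; below is a minimal standalone sketch of the same pdfplumber extraction (the function name and the 'sample/report.pdf' path are illustrative, not part of the commit):

    import pdfplumber

    def extract_text_with_pdfplumber(file_path):
        # Collect every page's text and join it into one string, as the new
        # branch in load_document does. The `or ""` guard is an extra safety
        # net for pages where extract_text() returns None; the committed code
        # joins the raw values.
        pages = []
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                pages.append(page.extract_text() or "")
        return ' '.join(pages)

    # text = extract_text_with_pdfplumber('sample/report.pdf')  # hypothetical path
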
udfPreprocess/paramconfig.cfg
ADDED
@@ -0,0 +1,12 @@
+[lexical_search]
+TOP_K = 10
+THRESHOLD = 0.1
+
+[semantic_search]
+TOP_K = 10
+MAX_SEQ_LENGTH = 64
+MODEL_NAME = msmarco-distilbert-cos-v5
+THRESHOLD = 0.1
+
+[sdg]
+THRESHOLD = 0.85
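
These values are read with configparser in sdg.py and search.py below; a minimal sketch of the same pattern, assuming the script runs from the repository root so the relative path resolves:

    import configparser

    config = configparser.ConfigParser()
    config.read_file(open('udfPreprocess/paramconfig.cfg'))

    # Values come back as strings and are cast where needed.
    lexical_top_k = int(config.get('lexical_search', 'TOP_K'))      # 10
    model_name = config.get('semantic_search', 'MODEL_NAME')        # msmarco-distilbert-cos-v5
    sdg_threshold = float(config.get('sdg', 'THRESHOLD'))           # 0.85
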
udfPreprocess/sdg.py
ADDED
@@ -0,0 +1,57 @@
+import glob, os, sys;
+sys.path.append('../udfPreprocess')
+
+#import helper
+import udfPreprocess.docPreprocessing as pre
+import udfPreprocess.cleaning as clean
+
+#import needed libraries
+import seaborn as sns
+from pandas import DataFrame
+from keybert import KeyBERT
+from transformers import pipeline
+import matplotlib.pyplot as plt
+import numpy as np
+import streamlit as st
+import pandas as pd
+import docx
+from docx.shared import Inches
+from docx.shared import Pt
+from docx.enum.style import WD_STYLE_TYPE
+
+import tempfile
+import sqlite3
+import logging
+logger = logging.getLogger(__name__)
+import configparser
+
+@st.cache(allow_output_mutation=True)
+def load_sdgClassifier():
+    classifier = pipeline("text-classification", model= "jonas/sdg_classifier_osdg")
+    logging.info("Loading classifier")
+    return classifier
+
+def sdg_classification(par_list):
+    logging.info("running SDG classifiication")
+    config = configparser.ConfigParser()
+    config.read_file(open('udfPreprocess/paramconfig.cfg'))
+    threshold = float(config.get('sdg','THRESHOLD'))
+
+
+    classifier = load_sdgClassifier()
+    labels = classifier(par_list)
+
+    labels_ = [(l['label'], l['score']) for l in labels]
+    # df2 = DataFrame(labels_, columns=["SDG", "Relevancy"])
+    df2 = DataFrame(labels_, columns=["SDG", "Relevancy"])
+
+    df2['text'] = par_list
+    df2 = df2.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
+    df2.index += 1
+    df2 = df2[df2['Relevancy'] > threshold]
+    x = df2['SDG'].value_counts()
+    df3 = df2.copy()
+    df3 = df3.drop(['Relevancy'], axis=1)
+
+
+    return df3, x
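
A hedged usage sketch for the new module: classify a few paragraphs and inspect what sdg_classification() returns (the sample paragraphs are invented; the call downloads the jonas/sdg_classifier_osdg model on first use, and running it outside a Streamlit session may only trigger cache warnings from @st.cache):

    import udfPreprocess.sdg as sdg

    paragraphs = [
        "Expand access to affordable and clean energy in rural areas.",
        "Improve maternal health services and reduce child mortality.",
    ]

    # df3 keeps the SDG label and text columns for paragraphs whose score
    # cleared the [sdg] THRESHOLD; x counts how often each SDG label survived.
    df3, x = sdg.sdg_classification(paragraphs)
    print(df3)
    print(x)
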
udfPreprocess/search.py
ADDED
@@ -0,0 +1,145 @@
+import glob, os, sys; sys.path.append('../udfPreprocess')
+
+#import helper
+import udfPreprocess.docPreprocessing as pre
+import udfPreprocess.cleaning as clean
+
+#import needed libraries
+import seaborn as sns
+from pandas import DataFrame
+from sentence_transformers import SentenceTransformer, CrossEncoder, util
+# from keybert import KeyBERT
+from transformers import pipeline
+import matplotlib.pyplot as plt
+import numpy as np
+import streamlit as st
+import pandas as pd
+from rank_bm25 import BM25Okapi
+from sklearn.feature_extraction import _stop_words
+import string
+from tqdm.autonotebook import tqdm
+import numpy as np
+import docx
+from docx.shared import Inches
+from docx.shared import Pt
+from docx.enum.style import WD_STYLE_TYPE
+import logging
+logger = logging.getLogger(__name__)
+import tempfile
+import sqlite3
+import configparser
+
+### These are lexcial search related functions/methods#####
+
+def bm25_tokenizer(text):
+    tokenized_doc = []
+    for token in text.lower().split():
+        token = token.strip(string.punctuation)
+
+        if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
+            tokenized_doc.append(token)
+    return tokenized_doc
+
+def bm25TokenizeDoc(paraList):
+    tokenized_corpus = []
+    ##########Commenting this for now########### will incorporate paragrpah splitting later.
+    # for passage in tqdm(paraList):
+    #     if len(passage.split()) > 256:
+    #         # st.write("Splitting")
+    #         temp = " ".join(passage.split()[:256])
+    #         tokenized_corpus.append(bm25_tokenizer(temp))
+    #         temp = " ".join(passage.split()[256:])
+    #         tokenized_corpus.append(bm25_tokenizer(temp))
+    #     else:
+    #         tokenized_corpus.append(bm25_tokenizer(passage))
+    ######################################################################################33333
+    for passage in tqdm(paraList):
+        tokenized_corpus.append(bm25_tokenizer(passage))
+
+    return tokenized_corpus
+
+def lexical_search(keyword, document_bm25):
+    config = configparser.ConfigParser()
+    config.read_file(open('udfPreprocess/paramconfig.cfg'))
+    top_k = int(config.get('lexical_search','TOP_K'))
+    bm25_scores = document_bm25.get_scores(bm25_tokenizer(keyword))
+    top_n = np.argpartition(bm25_scores, -top_k)[-top_k:]
+    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
+    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
+    return bm25_hits
+
+@st.cache(allow_output_mutation=True)
+def load_sentenceTransformer(name):
+    return SentenceTransformer(name)
+
+
+def semantic_search(keywordlist, paraList):
+
+    ##### Sematic Search #####
+    #query = "Does document contain {} issues ?".format(keyword)
+    config = configparser.ConfigParser()
+    config.read_file(open('udfPreprocess/paramconfig.cfg'))
+    model_name = config.get('semantic_search','MODEL_NAME')
+
+    bi_encoder = load_sentenceTransformer(model_name)
+    bi_encoder.max_seq_length = int(config.get('semantic_search','MAX_SEQ_LENGTH'))     #Truncate long passages to 256 tokens
+    top_k = int(config.get('semantic_search','TOP_K'))
+    document_embeddings = bi_encoder.encode(paraList, convert_to_tensor=True, show_progress_bar=False)
+    question_embedding = bi_encoder.encode(keywordlist, convert_to_tensor=True)
+
+    hits = util.semantic_search(question_embedding, document_embeddings, top_k=top_k)
+
+    return hits
+
+def show_results(keywordList):
+    document = docx.Document()
+    # document.add_heading('Document name:{}'.format(file_name), 2)
+    section = document.sections[0]
+
+    # Calling the footer
+    footer = section.footer
+
+    # Calling the paragraph already present in
+    # the footer section
+    footer_para = footer.paragraphs[0]
+
+    font_styles = document.styles
+    font_charstyle = font_styles.add_style('CommentsStyle', WD_STYLE_TYPE.CHARACTER)
+    font_object = font_charstyle.font
+    font_object.size = Pt(7)
+    # Adding the centered zoned footer
+    footer_para.add_run('''\tPowered by GIZ Data and the Sustainable Development Solution Network hosted at Hugging-Face spaces: https://huggingface.co/spaces/ppsingh/streamlit_dev''', style='CommentsStyle')
+    document.add_heading('Your Seacrhed for {}'.format(keywordList), level=1)
+    for keyword in keywordList:
+
+        st.write("Results for Query: {}".format(keyword))
+        para = document.add_paragraph().add_run("Results for Query: {}".format(keyword))
+        para.font.size = Pt(12)
+        bm25_hits, hits = search(keyword)
+
+        st.markdown("""
+                    We will provide with 2 kind of results. The 'lexical search' and the semantic search.
+                    """)
+        # In the semantic search part we provide two kind of results one with only Retriever (Bi-Encoder) and other the ReRanker (Cross Encoder)
+        st.markdown("Top few lexical search (BM25) hits")
+        document.add_paragraph("Top few lexical search (BM25) hits")
+
+        for hit in bm25_hits[0:5]:
+            if hit['score'] > 0.00:
+                st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
+                document.add_paragraph("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
+
+
+
+        # st.table(bm25_hits[0:3])
+
+        st.markdown("\n-------------------------\n")
+        st.markdown("Top few Bi-Encoder Retrieval hits")
+        document.add_paragraph("\n-------------------------\n")
+        document.add_paragraph("Top few Bi-Encoder Retrieval hits")
+
+        hits = sorted(hits, key=lambda x: x['score'], reverse=True)
+        for hit in hits[0:5]:
+            # if hit['score'] > 0.45:
+            st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
+            document.add_paragraph("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
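
A sketch of how the lexical and semantic helpers might be wired together outside the app (the sample paragraphs are invented; show_results() additionally expects a search() helper and a paraList in scope that are not defined in this file, so it is left out here). Note that lexical_search() takes the top TOP_K scores with np.argpartition, so the corpus needs at least TOP_K (here 10) paragraphs:

    from rank_bm25 import BM25Okapi
    import udfPreprocess.search as search

    paraList = [
        "Reduce greenhouse gas emissions through a carbon tax.",
        "Expand renewable energy capacity, especially solar and wind.",
        "Strengthen primary health care in rural districts.",
    ] * 4   # repeated so the list is longer than TOP_K from paramconfig.cfg

    # Lexical search: build the BM25 index once from the tokenized paragraphs.
    tokenized_corpus = search.bm25TokenizeDoc(paraList)
    document_bm25 = BM25Okapi(tokenized_corpus)
    bm25_hits = search.lexical_search("renewable energy", document_bm25)

    # Semantic search: bi-encoder named by MODEL_NAME in paramconfig.cfg.
    hits = search.semantic_search(["renewable energy"], paraList)

    for hit in bm25_hits[:3]:
        print(round(hit['score'], 3), paraList[hit['corpus_id']])
    for hit in hits[0][:3]:
        print(round(hit['score'], 3), paraList[hit['corpus_id']])
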
udfPreprocess/uploadAndExample.py
ADDED
@@ -0,0 +1,48 @@
+import streamlit as st
+import tempfile
+import udfPreprocess.docPreprocessing as pre
+import udfPreprocess.cleaning as clean
+
+def add_upload(choice):
+
+
+    if choice == 'Upload Document':
+        uploaded_file = st.sidebar.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
+        if uploaded_file is not None:
+            with tempfile.NamedTemporaryFile(mode="wb") as temp:
+                bytes_data = uploaded_file.getvalue()
+                temp.write(bytes_data)
+                st.session_state['filename'] = uploaded_file.name
+                # st.write("Uploaded Filename: ", uploaded_file.name)
+                file_name = uploaded_file.name
+                file_path = temp.name
+                docs = pre.load_document(file_path, file_name)
+                haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
+                st.session_state['docs'] = docs
+                st.session_state['paraList'] = paraList
+
+
+    else:
+        # listing the options
+        option = st.sidebar.selectbox('Select the example document',
+                                      ('South Africa:Low Emission strategy',
+                                       'Ethiopia: 10 Year Development Plan'))
+        if option is 'South Africa:Low Emission strategy':
+            file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
+            st.session_state['filename'] = file_name
+            # st.write("Selected document:", file_name.split('/')[1])
+            # with open('sample/South Africa_s Low Emission Development Strategy.txt') as dfile:
+            # file = open('sample/South Africa_s Low Emission Development Strategy.txt', 'wb')
+        else:
+            # with open('sample/Ethiopia_s_2021_10 Year Development Plan.txt') as dfile:
+            file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
+            st.session_state['filename'] = file_name
+            # st.write("Selected document:", file_name.split('/')[1])
+
+        if option is not None:
+            docs = pre.load_document(file_path,file_name)
+            haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
+            st.session_state['docs'] = docs
+            st.session_state['paraList'] = paraList
+
+
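
A sketch of how add_upload() might be driven from a hypothetical app.py entry point (the radio label and the second option string are assumptions; only the 'Upload Document' value is checked for literally in the function above):

    # hypothetical app.py for the Streamlit space
    import streamlit as st
    import udfPreprocess.uploadAndExample as upload

    choice = st.sidebar.radio("Document source",
                              ("Upload Document", "Try example documents"))

    # Fills st.session_state['docs'] and st.session_state['paraList']
    # for the downstream SDG classification and search steps.
    upload.add_upload(choice)

    if 'paraList' in st.session_state:
        st.write("{} paragraphs ready".format(len(st.session_state['paraList'])))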