prashant committed
Commit: cc5c327
Parent(s): 4a20529

lexical search app update

Files changed:
- app.py +2 -2
- appStore/keyword_search.py +18 -107
- appStore/sdg_analysis.py +6 -1
- paramconfig.cfg +2 -0
- utils/search.py +104 -7
- ver0.1 scripts/keyword_search.py +169 -0
app.py
CHANGED
@@ -1,4 +1,4 @@
-
+import appStore.keyword_search as keyword_search
 import appStore.sdg_analysis as sdg_analysis
 #import appStore.coherence as coherence
 import appStore.info as info
@@ -12,6 +12,6 @@ app = MultiApp()
 
 app.add_app("About","house", info.app)
 app.add_app("SDG Analysis","gear",sdg_analysis.app)
-
+app.add_app("Search","search", keyword_search.app)
 
 app.run()
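The new Search page is registered through the same MultiApp helper as the existing pages. That helper is not touched by this commit; a minimal sketch of such a page registry, assuming it only needs the add_app(title, icon, func) and run() calls shown above (the real class may render the icon names through a sidebar menu component), could look like this:

# multiapp.py -- hypothetical sketch, not the repo's actual implementation
import streamlit as st

class MultiApp:
    def __init__(self):
        self.apps = []

    def add_app(self, title, icon, func):
        # store the page title, its icon name and the callable that renders the page
        self.apps.append({"title": title, "icon": icon, "function": func})

    def run(self):
        # let the user pick a page in the sidebar and render it (icons ignored in this sketch)
        titles = [a["title"] for a in self.apps]
        choice = st.sidebar.radio("Go to", titles)
        for a in self.apps:
            if a["title"] == choice:
                a["function"]()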
appStore/keyword_search.py
CHANGED
@@ -1,38 +1,12 @@
 # set path
-import glob, os, sys
-
-sys.path.append('../udfPreprocess')
+import glob, os, sys;
+sys.path.append('../utils')
 
-#import helper
-import udfPreprocess.docPreprocessing as pre
-import udfPreprocess.cleaning as clean
-from udfPreprocess.search import bm25_tokenizer, bm25TokenizeDoc, lexical_search
-#import needed libraries
-import seaborn as sns
-from pandas import DataFrame
-from sentence_transformers import SentenceTransformer, CrossEncoder, util
-# from keybert import KeyBERT
-from transformers import pipeline
-import matplotlib.pyplot as plt
-import numpy as np
 import streamlit as st
-import pandas as pd
-from rank_bm25 import BM25Okapi
-from sklearn.feature_extraction import _stop_words
-import string
-from tqdm.autonotebook import tqdm
-import numpy as np
-import docx
-from docx.shared import Inches
-from docx.shared import Pt
-from docx.enum.style import WD_STYLE_TYPE
-import logging
-logger = logging.getLogger(__name__)
-import tempfile
-import sqlite3
 import json
-import configparser
-
+import logging
+from utils.search import runLexicalPreprocessingPipeline, tokenize_lexical_query
+from utils.search import runSpacyMatcher, lexical_search
 
 def app():
 
@@ -54,11 +28,9 @@ def app():
             """)
 
         st.markdown("")
-
 
-
    with st.sidebar:
-        with open('sample/keywordexample.json','r') as json_file:
+        with open('docStore/sample/keywordexample.json','r') as json_file:
            keywordexample = json.load(json_file)
 
        genre = st.radio("Select Keyword Category", list(keywordexample.keys()))
@@ -76,93 +48,32 @@ def app():
            keywordList = None
 
        searchtype = st.selectbox("Do you want to find exact macthes or similar meaning/context", ['Exact Matches', 'Similar context/meaning'])
-
-
+
    with st.container():
        if keywordList is not None:
            queryList = st.text_input("You selcted the {} category we will look for these keywords in document".format(genre),
                                      value="{}".format(keywordList))
        else:
            queryList = st.text_input("Please enter here your question and we will look \
                                       for an answer in the document OR enter the keyword you \
                                       are looking for and we will \
                                       we will look for similar context \
                                       in the document.",
                                      placeholder="Enter keyword here")
 
        if st.button("Find them"):
 
            if queryList == "":
                st.info("🤔 No keyword provided, if you dont have any, please try example sets from sidebar!")
                logging.warning("Terminated as no keyword provided")
            else:
-
-               if 'docs' in st.session_state:
-                   docs = st.session_state['docs']
-                   paraList = st.session_state['paraList']
-
+               if 'filepath' in st.session_state:
+                   paraList = runLexicalPreprocessingPipeline()
+
                   if searchtype == 'Exact Matches':
-                       queryList = list(queryList.split(","))
+                       # queryList = list(queryList.split(","))
                       logging.info("performing lexical search")
-                       tokenized_corpus = bm25TokenizeDoc(paraList)
-                       # st.write(len(tokenized_corpus))
-                       document_bm25 = BM25Okapi(tokenized_corpus)
-
+                       # token_list = tokenize_lexical_query(queryList)
                       with st.spinner("Performing Exact matching search (Lexical search) for you"):
-                           st.markdown("##### Top few lexical search (BM25) hits #####")
-
-                           for keyword in queryList:
-
-                               bm25_hits = lexical_search(keyword,document_bm25)
-
-                               counter = 0
-                               for hit in bm25_hits:
-                                   if hit['score'] > 0.00:
-                                       counter += 1
-                                       if counter == 1:
-                                           st.markdown("###### Results for keyword: **{}** ######".format(keyword))
-                                       # st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
-                                       st.write("\t {}: {}\t".format(counter, paraList[hit['corpus_id']].replace("\n", " ")))
-
-                               if counter == 0:
-                                   st.write("No results found for '**{}**' ".format(keyword))
-
-                               st.markdown("---")
-                   else:
-                       logging.info("starting semantic search")
-                       with st.spinner("Performing Similar/Contextual search"):
-                           query = "Find {} related issues ?".format(queryList)
-                           config = configparser.ConfigParser()
-                           config.read_file(open('udfPreprocess/paramconfig.cfg'))
-                           threshold = float(config.get('semantic_search','THRESHOLD'))
-                           # st.write(query)
-                           semantic_hits = semantic_search(query,paraList)
-                           st.markdown("##### Few Semantic search hits for {} related topics #####".format(queryList))
-
-                           for i,queryhit in enumerate(semantic_hits):
-
-                               # st.markdown("###### Results for query: **{}** ######".format(queryList[i]))
-                               counter = 0
-                               for hit in queryhit:
-                                   counter += 1
-
-                                   if hit['score'] > threshold:
-                                       # st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
-                                       st.write("\t {}: \t {}".format(counter, paraList[hit['corpus_id']].replace("\n", " ")))
-
-                                   # document.add_paragraph("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
-                               st.markdown("---")
-                           # st.write(semantic_hits)
-
-               else:
-                   st.info("🤔 No document found, please try to upload it at the sidebar!")
-                   logging.warning("Terminated as no keyword provided")
+                           lexical_search(queryList,paraList)
 
-
-
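Stripped of the Streamlit widgets, the refactored app() above now assumes an earlier page has stored the uploaded file under st.session_state['filepath'] and delegates all the work to utils/search.py. A condensed sketch of the Exact Matches path (function and key names are taken from this diff; exact_match_flow itself is only an illustrative wrapper):

import streamlit as st
from utils.search import runLexicalPreprocessingPipeline, lexical_search

def exact_match_flow(query):
    # illustrative wrapper around the new Exact Matches branch
    if 'filepath' not in st.session_state:
        st.info("🤔 No document found, please try to upload it at the sidebar!")
        return
    # split and clean the uploaded file into haystack Documents
    # (split_by / split_length come from paramconfig.cfg)
    paraList = runLexicalPreprocessingPipeline()
    # TF-IDF retrieval plus keyword highlighting, rendered inside utils/search.py
    lexical_search(query, paraList)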
appStore/sdg_analysis.py
CHANGED
@@ -46,7 +46,12 @@ def app():
 
     if 'filepath' in st.session_state:
         paraList = runSDGPreprocessingPipeline()
-
+        if len(paraList) > 150:
+            warning_msg = ": This might take some, please sit back and relax."
+        else:
+            warning_msg = ""
+
+        with st.spinner("Running SDG Classification{}".format(warning_msg)):
 
             df, x = sdg_classification(paraList)
paramconfig.cfg
CHANGED
@@ -1,6 +1,8 @@
 [lexical_search]
 TOP_K = 10
 THRESHOLD = 0.1
+SPLIT_BY = sentence
+SPLIT_LENGTH = 3
 
 [semantic_search]
 TOP_K = 10
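The two new keys are consumed by runLexicalPreprocessingPipeline in utils/search.py through configparser. For reference, reading them back looks roughly like this (a sketch; note the repo code opens 'paramconfig.py' while the file edited in this commit is paramconfig.cfg):

import configparser

config = configparser.ConfigParser()
config.read('paramconfig.cfg')

top_k = int(config.get('lexical_search', 'TOP_K'))                # 10
threshold = float(config.get('lexical_search', 'THRESHOLD'))      # 0.1
split_by = config.get('lexical_search', 'SPLIT_BY')               # sentence
split_length = int(config.get('lexical_search', 'SPLIT_LENGTH'))  # 3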
utils/search.py
CHANGED
@@ -7,17 +7,55 @@ from spacy.matcher import Matcher
 import streamlit as st
 from markdown import markdown
 from annotated_text import annotation
+from haystack.schema import Document
+from typing import List, Tuple, Text
+from utils.preprocessing import processingpipeline
 
 config = configparser.ConfigParser()
 config.read_file(open('paramconfig.py'))
 
 
-def tokenize_lexical_query(query):
+def tokenize_lexical_query(query:str)-> List[str]:
+    """
+    Removes the stop words from query and returns the list of important keywords
+    in query. For the lexical search the relevent paragraphs in document are
+    retreived using TfIDFretreiver from Haystack. However to highlight these
+    keywords we need the tokenized form of query.
+
+    Params
+    --------
+    query: string which represents either list of keywords user is looking for
+            or a query in form of Question.
+
+    Return
+    -----------
+    token_list: list of important keywords in the query.
+
+    """
     nlp = spacy.load("en_core_web_sm")
     token_list = [token.text.lower() for token in nlp(query) if not token.is_stop]
     return token_list
 
-def runSpacyMatcher(token_list, document):
+def runSpacyMatcher(token_list:List[str], document:Text):
+    """
+    Using the spacy in backend finds the keywords in the document using the
+    Matcher class from spacy. We can alternatively use the regex, but spacy
+    finds all keywords in serialized manner which helps in annotation of answers.
+
+    Params
+    -------
+    token_list: this is token list which tokenize_lexical_query function returns
+    document: text in which we need to find the tokens
+
+    Return
+    --------
+    matches: List of [start_index, end_index] in the spacydoc(at word level not
+            character) for the keywords in token list.
+
+    spacydoc: the keyword index in the spacydoc are at word level and not character,
+            therefore to allow the annotator to work seamlessly we return the spacydoc.
+
+    """
     nlp = spacy.load("en_core_web_sm")
     spacydoc = nlp(document)
     matcher = Matcher(nlp.vocab)
@@ -25,20 +63,47 @@ def runSpacyMatcher(token_list, document):
     matcher.add(",".join(token_list), token_pattern)
     spacymatches = matcher(spacydoc)
 
+    # getting start and end index in spacydoc so that annotator can work seamlessly
     matches = []
     for match_id, start, end in spacymatches:
         matches = matches + [[start, end]]
 
     return matches, spacydoc
 
-def runRegexMatcher(token_list, document):
+def runRegexMatcher(token_list:List[str], document:Text):
+    """
+    Using the regex in backend finds the keywords in the document.
+
+    Params
+    -------
+    token_list: this is token list which tokenize_lexical_query function returns
+
+    document: text in which we need to find the tokens
+
+    Return
+    --------
+    matches: List of [start_index, end_index] in the document for the keywords
+            in token list at character level.
+
+    document: the keyword index returned by regex are at character level,
+            therefore to allow the annotator to work seamlessly we return the text back.
+
+    """
     matches = []
     for token in token_list:
         matches = matches + [[val.start(), val.start()+ len(token)] for val in re.finditer(token, document)]
 
     return matches, document
 
-def searchAnnotator(matches, document):
+def searchAnnotator(matches: List[List[int]], document):
+    """
+    Annotates the text in the document defined by list of [start index, end index]
+    Example: "How are you today", if document type is text, matches = [[0,3]]
+    will give answer = "How", however in case we used the spacy matcher then the
+    matches = [[0,3]] will give answer = "How are you". However if spacy is used
+    to find "How" then the matches = [[0,1]] for the string defined above.
+
+    """
     start = 0
     annotated_text = ""
     for match in matches:
@@ -52,10 +117,16 @@ def searchAnnotator(matches, document):
         unsafe_allow_html=True,
     )
 
-def lexical_search(query,documents):
+def lexical_search(query:Text,documents:List[Document]):
+    """
+    Performs the Lexical search on the List of haystack documents which is
+    returned by preprocessing Pipeline.
+    """
 
     document_store = InMemoryDocumentStore()
     document_store.write_documents(documents)
+
+    # Haystack Retriever works with document stores only.
     retriever = TfidfRetriever(document_store)
     results = retriever.retrieve(query=query,
                     top_k= int(config.get('lexical_search','TOP_K')))
@@ -64,5 +135,31 @@ def lexical_search(query,documents):
         matches, doc = runSpacyMatcher(query_tokens,result.content)
         searchAnnotator(matches, doc)
 
-
-
+def runLexicalPreprocessingPipeline()->List[Document]:
+    """
+    creates the pipeline and runs the preprocessing pipeline,
+    the params for pipeline are fetched from paramconfig
+
+    Return
+    --------------
+    List[Document]: When preprocessing pipeline is run, the output dictionary
+    has four objects. For the Haysatck implementation of SDG classification we,
+    need to use the List of Haystack Document, which can be fetched by
+    key = 'documents' on output.
+
+    """
+    file_path = st.session_state['filepath']
+    file_name = st.session_state['filename']
+    sdg_processing_pipeline = processingpipeline()
+    split_by = config.get('lexical_search','SPLIT_BY')
+    split_length = int(config.get('lexical_search','SPLIT_LENGTH'))
+
+    output_lexical_pre = sdg_processing_pipeline.run(file_paths = file_path,
+        params= {"FileConverter": {"file_path": file_path, \
                                   "file_name": file_name},
                "UdfPreProcessor": {"removePunc": False, \
                                    "split_by": split_by, \
                                    "split_length":split_length}})
+
+    return output_lexical_pre['documents']
+
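The line that builds token_pattern inside runSpacyMatcher is not visible in this hunk. One plausible construction, consistent with matcher.add(",".join(token_list), token_pattern) and with the word-level [start, end] indices that searchAnnotator's docstring describes, is a list of single-token patterns (a sketch, not the repo's actual code):

import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
token_list = ["climate", "adaptation"]                      # e.g. output of tokenize_lexical_query
token_pattern = [[{"LOWER": tok}] for tok in token_list]    # assumed pattern shape

matcher = Matcher(nlp.vocab)
matcher.add(",".join(token_list), token_pattern)

spacydoc = nlp("Climate adaptation measures are described in the policy document.")
# each match is (match_id, start, end) with token-level offsets, as the docstrings note
matches = [[start, end] for _, start, end in matcher(spacydoc)]
print([spacydoc[s:e].text for s, e in matches])             # ['Climate', 'adaptation']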
ver0.1 scripts/keyword_search.py
ADDED
@@ -0,0 +1,169 @@
+# set path
+import glob, os, sys
+from udfPreprocess.search import semantic_search
+sys.path.append('../udfPreprocess')
+
+#import helper
+import udfPreprocess.docPreprocessing as pre
+import udfPreprocess.cleaning as clean
+from udfPreprocess.search import bm25_tokenizer, bm25TokenizeDoc, lexical_search
+#import needed libraries
+import seaborn as sns
+from pandas import DataFrame
+from sentence_transformers import SentenceTransformer, CrossEncoder, util
+# from keybert import KeyBERT
+from transformers import pipeline
+import matplotlib.pyplot as plt
+import numpy as np
+import streamlit as st
+import pandas as pd
+from rank_bm25 import BM25Okapi
+from sklearn.feature_extraction import _stop_words
+import string
+from tqdm.autonotebook import tqdm
+import numpy as np
+import docx
+from docx.shared import Inches
+from docx.shared import Pt
+from docx.enum.style import WD_STYLE_TYPE
+import logging
+logger = logging.getLogger(__name__)
+import tempfile
+import sqlite3
+import json
+import configparser
+
+
+def app():
+
+    with st.container():
+        st.markdown("<h1 style='text-align: center; \
+                      color: black;'> Search</h1>",
+                    unsafe_allow_html=True)
+        st.write(' ')
+        st.write(' ')
+
+    with st.expander("ℹ️ - About this app", expanded=False):
+
+        st.write(
+            """
+            The *Keyword Search* app is an easy-to-use interface \
+            built in Streamlit for doing keyword search in \
+            policy document - developed by GIZ Data and the \
+            Sustainable Development Solution Network.
+            """)
+
+        st.markdown("")
+
+
+
+    with st.sidebar:
+        with open('sample/keywordexample.json','r') as json_file:
+            keywordexample = json.load(json_file)
+
+        genre = st.radio("Select Keyword Category", list(keywordexample.keys()))
+        if genre == 'Food':
+            keywordList = keywordexample['Food']
+        elif genre == 'Climate':
+            keywordList = keywordexample['Climate']
+        elif genre == 'Social':
+            keywordList = keywordexample['Social']
+        elif genre == 'Nature':
+            keywordList = keywordexample['Nature']
+        elif genre == 'Implementation':
+            keywordList = keywordexample['Implementation']
+        else:
+            keywordList = None
+
+        searchtype = st.selectbox("Do you want to find exact macthes or similar meaning/context", ['Exact Matches', 'Similar context/meaning'])
+
+
+    with st.container():
+        if keywordList is not None:
+            queryList = st.text_input("You selcted the {} category we will look for these keywords in document".format(genre),
+                                      value="{}".format(keywordList))
+        else:
+            queryList = st.text_input("Please enter here your question and we will look \
+                                       for an answer in the document OR enter the keyword you \
+                                       are looking for and we will \
+                                       we will look for similar context \
+                                       in the document.",
+                                      placeholder="Enter keyword here")
+
+        if st.button("Find them"):
+
+            if queryList == "":
+                st.info("🤔 No keyword provided, if you dont have any, please try example sets from sidebar!")
+                logging.warning("Terminated as no keyword provided")
+            else:
+
+                if 'docs' in st.session_state:
+                    docs = st.session_state['docs']
+                    paraList = st.session_state['paraList']
+
+                    if searchtype == 'Exact Matches':
+                        queryList = list(queryList.split(","))
+                        logging.info("performing lexical search")
+                        tokenized_corpus = bm25TokenizeDoc(paraList)
+                        # st.write(len(tokenized_corpus))
+                        document_bm25 = BM25Okapi(tokenized_corpus)
+
+                        with st.spinner("Performing Exact matching search (Lexical search) for you"):
+                            st.markdown("##### Top few lexical search (BM25) hits #####")
+
+                            for keyword in queryList:
+
+                                bm25_hits = lexical_search(keyword,document_bm25)
+
+
+                                counter = 0
+                                for hit in bm25_hits:
+                                    if hit['score'] > 0.00:
+                                        counter += 1
+                                        if counter == 1:
+                                            st.markdown("###### Results for keyword: **{}** ######".format(keyword))
+                                        # st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
+                                        st.write("\t {}: {}\t".format(counter, paraList[hit['corpus_id']].replace("\n", " ")))
+
+
+                                if counter == 0:
+                                    st.write("No results found for '**{}**' ".format(keyword))
+
+                                st.markdown("---")
+                    else:
+                        logging.info("starting semantic search")
+                        with st.spinner("Performing Similar/Contextual search"):
+                            query = "Find {} related issues ?".format(queryList)
+                            config = configparser.ConfigParser()
+                            config.read_file(open('udfPreprocess/paramconfig.cfg'))
+                            threshold = float(config.get('semantic_search','THRESHOLD'))
+                            # st.write(query)
+                            semantic_hits = semantic_search(query,paraList)
+                            st.markdown("##### Few Semantic search hits for {} related topics #####".format(queryList))
+
+                            for i,queryhit in enumerate(semantic_hits):
+
+                                # st.markdown("###### Results for query: **{}** ######".format(queryList[i]))
+                                counter = 0
+                                for hit in queryhit:
+                                    counter += 1
+
+
+                                    if hit['score'] > threshold:
+                                        # st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
+                                        st.write("\t {}: \t {}".format(counter, paraList[hit['corpus_id']].replace("\n", " ")))
+
+                                    # document.add_paragraph("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
+                                st.markdown("---")
+                            # st.write(semantic_hits)
+
+
+
+
+                else:
+                    st.info("🤔 No document found, please try to upload it at the sidebar!")
+                    logging.warning("Terminated as no keyword provided")
+
+
+
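The archived ver0.1 script keeps the rank_bm25 code path via bm25_tokenizer, bm25TokenizeDoc and lexical_search from udfPreprocess.search, none of which are part of this commit. A rough sketch of that style of BM25 lookup, assuming the tokenizer simply lower-cases, strips punctuation and drops English stop words:

import string
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words

def bm25_tokenizer(text):
    # assumed behaviour of udfPreprocess.search.bm25_tokenizer, not the actual code
    tokens = []
    for token in text.lower().split():
        token = token.strip(string.punctuation)
        if token and token not in _stop_words.ENGLISH_STOP_WORDS:
            tokens.append(token)
    return tokens

paraList = ["Climate adaptation plans for agriculture.",
            "Food security depends on irrigation and soil health."]
tokenized_corpus = [bm25_tokenizer(p) for p in paraList]    # bm25TokenizeDoc equivalent
document_bm25 = BM25Okapi(tokenized_corpus)

query_tokens = bm25_tokenizer("climate adaptation")
scores = document_bm25.get_scores(query_tokens)             # one BM25 score per paragraph
print(document_bm25.get_top_n(query_tokens, paraList, n=1))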