Spaces:
GIZ

prashant committed on
Commit
f9949bb
1 Parent(s): fb4cce0

lexcial update

appStore/keyword_search.py CHANGED

@@ -47,12 +47,9 @@ def app():
    else:
        keywordList = None

-   searchtype = st.selectbox("Do you want to find exact macthes or similar meaning/context",
+   searchtype = st.selectbox("Do you want to find exact macthes or similar \
+                               meaning/context",
                              ['Exact Matches', 'Similar context/meaning'])
-   # if searchtype == 'Similar context/meaning':
-   #     show_answers = st.sidebar.checkbox("Show context")
-
-

    with st.container():

@@ -61,33 +58,38 @@ def app():
                            will look for these keywords in document".format(genre),
                            value="{}".format(keywordList))
        else:
-           queryList = st.text_input("Please enter here your question and we will look \
-                                       for an answer in the document OR enter the keyword you \
-                                       are looking for and we will \
-                                       we will look for similar context \
-                                       in the document.",
+           queryList = st.text_input("Please enter here your question and we \
+                                       will look for an answer in the document\
+                                       OR enter the keyword you are looking \
+                                       for and we will we will look for similar\
+                                       context in the document.",
                                       placeholder="Enter keyword here")

        if st.button("Find them"):

            if queryList == "":
-               st.info("🤔 No keyword provided, if you dont have any, please try example sets from sidebar!")
+               st.info("🤔 No keyword provided, if you dont have any, \
+                   please try example sets from sidebar!")
                logging.warning("Terminated as no keyword provided")
            else:
                if 'filepath' in st.session_state:

+
                    if searchtype == 'Exact Matches':
-                       paraList = runLexicalPreprocessingPipeline()
+                       allDocuments = runLexicalPreprocessingPipeline(
+                                           st.session_state['filepath'],
+                                           st.session_state['filename'])
                        logging.info("performing lexical search")
-                       with st.spinner("Performing Exact matching search (Lexical search) for you"):
+                       with st.spinner("Performing Exact matching search \
+                           (Lexical search) for you"):
                            st.markdown("##### Top few lexical search (TFIDF) hits #####")
-                           lexical_search(queryList,paraList)
+                           lexical_search(queryList,allDocuments['documents'])
                    else:
-
-                       paraList = runSemanticPreprocessingPipeline()
-                       logging.info("starting semantic search")
-                       with st.spinner("Performing Similar/Contextual search"):
-                           semantic_search(queryList,paraList)
+                       pass
+                       # paraList = runSemanticPreprocessingPipeline()
+                       # logging.info("starting semantic search")
+                       # with st.spinner("Performing Similar/Contextual search"):
+                       #     semantic_search(queryList,paraList)

                else:
                    st.info("🤔 No document found, please try to upload it at the sidebar!")
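With this change keyword_search.py no longer lets the pipeline read the upload out of session state itself: the file path and file name are passed in explicitly and the full pipeline output dictionary comes back, with the Haystack documents under the 'documents' key. A minimal sketch of the new call flow from plain Python (the path and query below are only illustrative; inside the app they come from st.session_state and the text input):

from utils.lexical_search import runLexicalPreprocessingPipeline, lexical_search

# illustrative inputs; in the Streamlit app these come from
# st.session_state['filepath'] / st.session_state['filename']
file_path = "docs/sample.pdf"
file_name = "sample.pdf"

# runs FileConverter + UdfPreProcessor and returns the whole output dict
allDocuments = runLexicalPreprocessingPipeline(file_path, file_name)

# TFIDF retrieval over the preprocessed paragraphs, with spaCy-based highlighting
lexical_search("climate adaptation", allDocuments['documents'])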
utils/lexical_search.py CHANGED

@@ -1,20 +1,67 @@
  from haystack.nodes import TfidfRetriever
- from haystack.nodes.base import BaseComponent
  from haystack.document_stores import InMemoryDocumentStore
- import configparser
  import spacy
  import re
  from spacy.matcher import Matcher
+ from termcolor import colored
  import streamlit as st
  from markdown import markdown
  from annotated_text import annotation
  from haystack.schema import Document
  from typing import List, Text
  from utils.preprocessing import processingpipeline
- from haystack.pipelines import Pipeline
+ from utils.streamlitcheck import check_streamlit
+ import configparser
+ import logging

+ try:
+     import streamlit as st
+ except ImportError:
+     logging.info("Streamlit not installed")
  config = configparser.ConfigParser()
- config.read_file(open('paramconfig.cfg'))
+ try:
+     config.read_file(open('paramconfig.cfg'))
+ except Exception:
+     logging.info("paramconfig file not found")
+     st.info("Please place the paramconfig file in the same directory as app.py")
+
+
+ def runLexicalPreprocessingPipeline(file_path, file_name)->List[Document]:
+     """
+     creates the pipeline and runs the preprocessing pipeline,
+     the params for pipeline are fetched from paramconfig
+
+     Params
+     ------------
+
+     file_name: filename, in case of streamlit application use
+                 st.session_state['filename']
+     file_path: filepath, in case of streamlit application use
+                 st.session_state['filepath']
+
+     Return
+     --------------
+     List[Document]: When preprocessing pipeline is run, the output dictionary
+         has four objects. For the lexicaal search using TFIDFRetriever we
+         need to use the List of Haystack Document, which can be fetched by
+         key = 'documents' on output.
+
+     """
+
+     lexical_processing_pipeline = processingpipeline()
+     split_by = config.get('lexical_search','SPLIT_BY')
+     split_length = int(config.get('lexical_search','SPLIT_LENGTH'))
+     split_overlap = int(config.get('lexical_search','SPLIT_OVERLAP'))
+
+     output_lexical_pre = lexical_processing_pipeline.run(file_paths = file_path,
+                             params= {"FileConverter": {"file_path": file_path, \
+                                         "file_name": file_name},
+                                      "UdfPreProcessor": {"removePunc": False, \
+                                         "split_by": split_by, \
+                                         "split_length":split_length,\
+                                         "split_overlap": split_overlap}})
+
+     return output_lexical_pre


  def tokenize_lexical_query(query:str)-> List[str]:

@@ -100,61 +147,56 @@ def runRegexMatcher(token_list:List[str], document:Text):

      return matches, document

- def lexicalsearchAnnotator(matches: List[List[int]], document):
+ def spacyAnnotator(matches: List[List[int]], document:spacy.token.doc.Doc):
      """
+     This is spacy Annotator and needs spacy.doc
      Annotates the text in the document defined by list of [start index, end index]
      Example: "How are you today", if document type is text, matches = [[0,3]]
      will give answer = "How", however in case we used the spacy matcher then the
      matches = [[0,3]] will give answer = "How are you". However if spacy is used
      to find "How" then the matches = [[0,1]] for the string defined above.

+     Params
+     -----------
+     matches: As mentioned its list of list. Example [[0,1],[10,13]]
+     document: document which needs to be indexed.
+
+     Return
+     --------
+     will send the output to either app front end using streamlit or
+     write directly to output screen.
+
      """
      start = 0
      annotated_text = ""
      for match in matches:
          start_idx = match[0]
          end_idx = match[1]
-         annotated_text = (annotated_text + document[start:start_idx].text
-                             + str(annotation(body=document[start_idx:end_idx].text,
-                             label="ANSWER", background="#964448", color='#ffffff')))
+
+         if check_streamlit():
+             annotated_text = (annotated_text + document[start:start_idx].text
+                                 + str(annotation(body=document[start_idx:end_idx].text,
+                                 label="ANSWER", background="#964448", color='#ffffff')))
+         else:
+             annotated_text = (annotated_text + document[start:start_idx].text
+                                 + colored(document[start_idx:end_idx].text,
+                                 "green", attrs = ['bold']))
+
          start = end_idx

      annotated_text = annotated_text + document[end_idx:].text
-
-     st.write(
-             markdown(annotated_text),
-             unsafe_allow_html=True,
-         )

- def runLexicalPreprocessingPipeline()->List[Document]:
-     """
-     creates the pipeline and runs the preprocessing pipeline,
-     the params for pipeline are fetched from paramconfig
-
-     Return
-     --------------
-     List[Document]: When preprocessing pipeline is run, the output dictionary
-         has four objects. For the lexicaal search using TFIDFRetriever we
-         need to use the List of Haystack Document, which can be fetched by
-         key = 'documents' on output.
-
-     """
-     file_path = st.session_state['filepath']
-     file_name = st.session_state['filename']
-     lexical_processing_pipeline = processingpipeline()
-     split_by = config.get('lexical_search','SPLIT_BY')
-     split_length = int(config.get('lexical_search','SPLIT_LENGTH'))
-     split_overlap = int(config.get('lexical_search','SPLIT_OVERLAP'))
-
-     output_lexical_pre = lexical_processing_pipeline.run(file_paths = file_path,
-                             params= {"FileConverter": {"file_path": file_path, \
-                                         "file_name": file_name},
-                                      "UdfPreProcessor": {"removePunc": False, \
-                                         "split_by": split_by, \
-                                         "split_length":split_length,\
-                                         "split_overlap": split_overlap}})
-
-     return output_lexical_pre['documents']
+     if check_streamlit():
+         st.write(
+                 markdown(annotated_text),
+                 unsafe_allow_html=True,
+             )
+     else:
+         print(annotated_text)

  def lexical_search(query:Text,documents:List[Document]):
      """

@@ -164,7 +206,7 @@ def lexical_search(query:Text,documents:List[Document]):
      Params
      -------
      query: Keywords that need to be searche in documents.
-     documents: List fo Haystack documents returned by preprocessing pipeline.
+     documents: List of Haystack documents returned by preprocessing pipeline.

      """

@@ -177,9 +219,11 @@ def lexical_search(query:Text,documents:List[Document]):
                                  top_k= int(config.get('lexical_search','TOP_K')))
      query_tokens = tokenize_lexical_query(query)
      for count, result in enumerate(results):
-         # if result.content != "":
          matches, doc = runSpacyMatcher(query_tokens,result.content)
          if len(matches) != 0:
-             st.write("Result {}".format(count+1))
-             lexicalsearchAnnotator(matches, doc)
+             if check_streamlit():
+                 st.write("Result {}".format(count+1))
+             else:
+                 print("Results {}".format(count +1))
+             spacyAnnotator(matches, doc)
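The retriever setup itself sits outside the changed hunks, but the imports at the top of the file point at the farm-haystack 1.x TFIDF pattern. A hedged, self-contained sketch of that pattern (the example document and query are illustrative; lexical_search reads its top_k from paramconfig.cfg rather than hard-coding it):

from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TfidfRetriever
from haystack.schema import Document

# write preprocessed paragraphs into an in-memory store
document_store = InMemoryDocumentStore()
document_store.write_documents([
    Document(content="Lexical search ranks paragraphs by TFIDF overlap with the query."),
])

# TFIDF retriever over that store; in the app top_k comes from paramconfig.cfg
retriever = TfidfRetriever(document_store=document_store)
results = retriever.retrieve(query="lexical search", top_k=3)
for result in results:
    print(result.content)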
utils/sdg_classifier.py CHANGED

@@ -56,7 +56,7 @@ def sdg_classification(haystackdoc:List[Document])->Tuple[DataFrame,Series]:
          the number of times it is covered/discussed/count_of_paragraphs.

      """
-     logging.info("running SDG classifiication")
+     logging.info("Working on SDG Classification")
      threshold = float(config.get('sdg','THRESHOLD'))


@@ -83,7 +83,7 @@ def runSDGPreprocessingPipeline(file_path, file_name)->List[Document]:
      creates the pipeline and runs the preprocessing pipeline,
      the params for pipeline are fetched from paramconfig

-     Param
+     Params
      ------------

      file_name: filename, in case of streamlit application use
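Both utils modules keep pulling their parameters from paramconfig.cfg through configparser; the code in this commit references SPLIT_BY, SPLIT_LENGTH, SPLIT_OVERLAP and TOP_K under [lexical_search] and THRESHOLD under [sdg]. A sketch of writing and reading such a file with configparser — the section and key names come from the code above, while the values are illustrative guesses, not the ones shipped with the Space:

import configparser

config = configparser.ConfigParser()
# keys taken from the code above; values are illustrative only
config['lexical_search'] = {
    'SPLIT_BY': 'word',
    'SPLIT_LENGTH': '100',
    'SPLIT_OVERLAP': '10',
    'TOP_K': '10',
}
config['sdg'] = {'THRESHOLD': '0.85'}

with open('paramconfig.cfg', 'w') as f:
    config.write(f)

# the modules read the values back the same way, e.g.
top_k = int(config.get('lexical_search', 'TOP_K'))
threshold = float(config.get('sdg', 'THRESHOLD'))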
utils/streamlitcheck.py ADDED

@@ -0,0 +1,19 @@
+ def check_streamlit():
+     """
+     Function to check whether python code is run within streamlit
+
+     Returns
+     -------
+     use_streamlit : boolean
+         True if code is run within streamlit, else False
+     """
+     try:
+         from streamlit.scriptrunner.script_run_context import get_script_run_ctx
+         if not get_script_run_ctx():
+             use_streamlit = False
+         else:
+             use_streamlit = True
+     except ModuleNotFoundError:
+         use_streamlit = False
+     return use_streamlit
+
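check_streamlit() is what lets spacyAnnotator and lexical_search fall back to plain print output when the utilities run from a terminal instead of the Space. A small usage sketch (the report helper is hypothetical; the scriptrunner import path above belongs to Streamlit releases of this era, and newer releases that moved it under streamlit.runtime would simply make the helper return False):

import logging
from utils.streamlitcheck import check_streamlit

def report(text: str):
    # send output to the Streamlit front end when available, else to stdout
    if check_streamlit():
        import streamlit as st
        st.write(text)
    else:
        print(text)

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    report("Running outside Streamlit, so this line goes to the terminal.")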