prashant committed
Commit 9f55059
1 Parent(s): 949b596
refactoring semantic_search; PEP edits in other files
- appStore/multiapp.py +1 -1
- appStore/sdg_analysis.py +6 -7
- utils/checkconfig.py +5 -2
- utils/keyword_extraction.py +2 -2
- utils/lexical_search.py +11 -7
- utils/preprocessing.py +5 -5
- utils/sdg_classifier.py +22 -17
- utils/semantic_search.py +311 -90
appStore/multiapp.py
CHANGED
@@ -46,7 +46,7 @@ class MultiApp:
 
         st.sidebar.write(format_func=lambda app: app['title'])
         image = Image.open('docStore/img/giz_sdsn_small.jpg')
-        st.sidebar.image(image, width =
+        st.sidebar.image(image, width =200)
 
         with st.sidebar:
             selected = option_menu(None, [page["title"] for page in self.apps],
appStore/sdg_analysis.py
CHANGED
@@ -93,12 +93,11 @@ def app():
             file_path = st.session_state['filepath']
             classifier = load_sdgClassifier(classifier_name=model_name)
             st.session_state['sdg_classifier'] = classifier
-            all_documents = runSDGPreprocessingPipeline(
-
+            all_documents = runSDGPreprocessingPipeline(file_name= file_name,
+                            file_path= file_path, split_by= split_by,
                             split_length= split_length,
-                            split_overlap= split_overlap,
                             split_respect_sentence_boundary= split_respect_sentence_boundary,
-                            remove_punc= remove_punc)
+                            split_overlap= split_overlap, remove_punc= remove_punc)
 
             if len(all_documents['documents']) > 100:
                 warning_msg = ": This might take sometime, please sit back and relax."
@@ -110,14 +109,14 @@ def app():
             df, x = sdg_classification(haystack_doc=all_documents['documents'],
                                        threshold= threshold)
             df = df.drop(['Relevancy'], axis = 1)
-            sdg_labels = x.SDG.unique()
+            sdg_labels = x.SDG.unique()
             textrank_keyword_list = []
             for label in sdg_labels:
                 sdgdata = " ".join(df[df.SDG == label].text.to_list())
                 textranklist_ = textrank(textdata=sdgdata, words= top_n)
                 if len(textranklist_) > 0:
                     textrank_keyword_list.append({'SDG':label, 'TextRank Keywords':",".join(textranklist_)})
-
+            textrank_keywords_df = pd.DataFrame(textrank_keyword_list)
 
 
             plt.rcParams['font.size'] = 25
@@ -145,7 +144,7 @@ def app():
             st.write("")
             st.markdown("###### What keywords are present under SDG classified text? ######")
 
-            AgGrid(
+            AgGrid(textrank_keywords_df, reload_data = False,
                    update_mode="value_changed",
                    columns_auto_size_mode = ColumnsAutoSizeMode.FIT_CONTENTS)
             st.write("")
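Note (outside this commit): the loop above builds one dict per SDG label and the new line turns them into the DataFrame that AgGrid renders. A minimal standalone sketch of that data flow, with toy data and a stubbed textrank (the real one lives in utils.keyword_extraction):

import pandas as pd

# stub standing in for utils.keyword_extraction.textrank (returns keyword strings)
def textrank(textdata, words=10):
    return textdata.split()[:words]

textrank_keyword_list = []
for label, text in [(6, "water sanitation access"), (7, "clean energy supply")]:
    textranklist_ = textrank(text, words=2)
    if len(textranklist_) > 0:
        textrank_keyword_list.append({'SDG': label,
                                      'TextRank Keywords': ",".join(textranklist_)})

textrank_keywords_df = pd.DataFrame(textrank_keyword_list)
print(textrank_keywords_df)  # two rows: SDG 6 and SDG 7 with their keywords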
utils/checkconfig.py
CHANGED
@@ -1,12 +1,15 @@
 import configparser
 import logging
 
-def getconfig(
+def getconfig(configfile_path:str):
+    """
+    configfile_path: file path of .cfg file
+    """
 
     config = configparser.ConfigParser()
 
     try:
-        config.read_file(open(
+        config.read_file(open(configfile_path))
        return config
    except:
        logging.warning("config file not found")
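Note (outside this commit): with the config path now passed explicitly, a call looks like the sketch below; 'paramconfig.cfg' and the section/key names are placeholders, not taken from the repo:

from utils.checkconfig import getconfig

config = getconfig('paramconfig.cfg')
# standard configparser access: config.get(section, key), with optional fallback
split_length = int(config.get('semantic_search', 'SPLIT_LENGTH', fallback='2'))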
utils/keyword_extraction.py
CHANGED
@@ -58,7 +58,7 @@ def extract_topn_from_vector(feature_names, sorted_items, top_n=10):
     return results
 
 
-def tfidf_keyword(textdata, vectorizer, tfidfmodel, top_n):
+def tfidf_keyword(textdata:str, vectorizer, tfidfmodel, top_n):
     """
     TFIDF based keywords extraction
 
@@ -108,7 +108,7 @@ def keyword_extraction(sdg:int,sdgdata:List[Text], top_n:int=10):
     return keywords
 
 @st.cache(allow_output_mutation=True)
-def textrank(textdata:Text, ratio:float = 0.1, words = 0):
+def textrank(textdata:Text, ratio:float = 0.1, words:int = 0)->List[str]:
     """
     wrappper function to perform textrank, uses either ratio or wordcount to
     extract top keywords limited by words or ratio.
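Note (outside this commit): the new textrank signature advertises a words-or-ratio switch. A sketch of that dispatch is below, assuming summa as the TextRank backend; the hunk does not show which library the wrapper actually calls, so treat the backend choice as illustrative:

from summa import keywords as summa_keywords

def textrank_sketch(textdata, ratio=0.1, words=0):
    # an explicit word count wins; otherwise fall back to the ratio
    if words > 0:
        return summa_keywords.keywords(textdata, words=words, split=True)
    return summa_keywords.keywords(textdata, ratio=ratio, split=True)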
utils/lexical_search.py
CHANGED
@@ -7,7 +7,7 @@ import streamlit as st
 from markdown import markdown
 from annotated_text import annotation
 from haystack.schema import Document
-from typing import List, Text
+from typing import List, Text, Tuple
 from typing_extensions import Literal
 from utils.preprocessing import processingpipeline
 from utils.streamlitcheck import check_streamlit
@@ -23,10 +23,10 @@ except ImportError:
     logging.info("Streamlit not installed")
 
 
-def runLexicalPreprocessingPipeline(file_path,
+def runLexicalPreprocessingPipeline(file_name:str,file_path:str,
                     split_by: Literal["sentence", "word"] = 'word',
-                    split_length:int = 80,
-
+                    split_length:int = 80, split_overlap:int = 0,
+                    remove_punc:bool = False,)->List[Document]:
     """
     creates the pipeline and runs the preprocessing pipeline,
     the params for pipeline are fetched from paramconfig. As lexical doesnt gets
@@ -40,11 +40,14 @@ def runLexicalPreprocessingPipeline(file_path,file_name,
                 st.session_state['filename']
     file_path: filepath, in case of streamlit application use
                 st.session_state['filepath']
-    removePunc: to remove all Punctuation including ',' and '.' or not
     split_by: document splitting strategy either as word or sentence
     split_length: when synthetically creating the paragrpahs from document,
                 it defines the length of paragraph.
+    split_overlap: Number of words or sentences that overlap when creating
+                the paragraphs. This is done as one sentence or 'some words' make sense
+                when read in together with others. Therefore the overlap is used.
     split_respect_sentence_boundary: Used when using 'word' strategy for
                 splititng of text.
+    removePunc: to remove all Punctuation including ',' and '.' or not
 
     Return
     --------------
@@ -91,7 +94,8 @@ def tokenize_lexical_query(query:str)-> List[str]:
                   if not (token.is_stop or token.is_punct)]
     return token_list
 
-def runSpacyMatcher(token_list:List[str], document:Text
+def runSpacyMatcher(token_list:List[str], document:Text
+                    )->Tuple(List[List[int]],spacy.tokens.doc.Doc):
     """
     Using the spacy in backend finds the keywords in the document using the
     Matcher class from spacy. We can alternatively use the regex, but spacy
@@ -203,7 +207,7 @@ def spacyAnnotator(matches: List[List[int]], document:spacy.tokens.doc.Doc):
     else:
         print(annotated_text)
 
-def lexical_search(query:Text,
+def lexical_search(query:Text, documents:List[Document],top_k:int):
     """
     Performs the Lexical search on the List of haystack documents which is
     returned by preprocessing Pipeline.
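Note (outside this commit): runSpacyMatcher's docstring says keywords are located with spaCy's Matcher class. A self-contained sketch of that mechanism, assuming the small English model is installed (python -m spacy download en_core_web_sm):

import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
doc = nlp("Access to clean water is a core water-related issue.")

matcher = Matcher(nlp.vocab)
# one token-pattern per query token; LOWER makes the match case-insensitive
matcher.add("QUERY", [[{"LOWER": "water"}]])

# each match is (match_id, start_token, end_token); convert to char offsets
spans = [[doc[start].idx, doc[end - 1].idx + len(doc[end - 1])]
         for _, start, end in matcher(doc)]
print(spans)  # character offsets in the shape the annotator functions expect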
utils/preprocessing.py
CHANGED
@@ -120,7 +120,7 @@ class FileConverter(BaseComponent):
         return
 
 
-def basic(s, remove_punc:bool = False):
+def basic(s:str, remove_punc:bool = False):
 
     """
     Performs basic cleaning of text.
@@ -164,10 +164,10 @@ class UdfPreProcessor(BaseComponent):
     """
     outgoing_edges = 1
 
-    def run(self, documents:List[Document], remove_punc:bool,
+    def run(self, documents:List[Document], remove_punc:bool=False,
             split_by: Literal["sentence", "word"] = 'sentence',
-            split_respect_sentence_boundary = False,
-
+            split_length:int = 2, split_respect_sentence_boundary:bool = False,
+            split_overlap:int = 0):
 
         """ this is required method to invoke the component in
         the pipeline implementation.
@@ -175,7 +175,7 @@ class UdfPreProcessor(BaseComponent):
         Params
         ----------
         documents: documents from the output dictionary returned by Fileconverter
-
+        remove_punc: to remove all Punctuation including ',' and '.' or not
         split_by: document splitting strategy either as word or sentence
         split_length: when synthetically creating the paragrpahs from document,
                       it defines the length of paragraph.
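Note (outside this commit): the run() signature above mirrors the parameters of Haystack's PreProcessor node, which this component appears to wrap. For reference, the equivalent direct configuration under the Haystack 1.x API:

from haystack.nodes import PreProcessor

preprocessor = PreProcessor(
    split_by="sentence",                    # or "word"
    split_length=2,                         # paragraph size, in sentences/words
    split_overlap=0,                        # units shared between neighbours
    split_respect_sentence_boundary=False,  # only meaningful for split_by="word"
)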
utils/sdg_classifier.py
CHANGED
@@ -34,7 +34,7 @@ _lab_dict = {0: 'no_cat',
              17:'SDG 17 - Partnership for the goals',}
 
 @st.cache(allow_output_mutation=True)
-def load_sdgClassifier(config_file = None, classifier_name = None):
+def load_sdgClassifier(config_file:str = None, classifier_name:str = None):
     """
     loads the document classifier using haystack, where the name/path of model
     in HF-hub as string is used to fetch the model object.Either configfile or
@@ -44,8 +44,8 @@ def load_sdgClassifier(config_file = None, classifier_name = None):
 
     Params
     --------
-
-
+    config_file: config file path from which to read the model name
+    classifier_name: if modelname is passed, it takes a priority if not \
     found then will look for configfile, else raise error.
 
 
@@ -69,7 +69,9 @@ def load_sdgClassifier(config_file = None, classifier_name = None):
 
 @st.cache(allow_output_mutation=True)
 def sdg_classification(haystack_doc:List[Document],
-                       threshold:float
+                       threshold:float = 0.8,
+                       classifier_model:TransformersDocumentClassifier= None
+                       )->Tuple[DataFrame,Series]:
     """
     Text-Classification on the list of texts provided. Classifier provides the
     most appropriate label for each text. these labels are in terms of if text
@@ -77,12 +79,13 @@ def sdg_classification(haystack_doc:List[Document],
 
     Params
     ---------
-
+    haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
         contains the list of paragraphs in different format,here the list of
         Haystack Documents is used.
     threshold: threshold value for the model to keep the results from classifier
-    classifiermodel: you can pass the classifier model directly,
-    streamlit
+    classifiermodel: you can pass the classifier model directly,which takes priority
+        however if not then looks for model in streamlit session.
+        In case of streamlit avoid passing the model directly.
 
 
     Returns
@@ -117,7 +120,7 @@ def sdg_classification(haystack_doc:List[Document],
     x = x.rename('count')
     x = x.rename_axis('SDG').reset_index()
     x["SDG"] = pd.to_numeric(x["SDG"])
-    x = x.sort_values(by=['count'])
+    x = x.sort_values(by=['count'], ascending=False)
     x['SDG_name'] = x['SDG'].apply(lambda x: _lab_dict[x])
     x['SDG_Num'] = x['SDG'].apply(lambda x: "SDG "+str(x))
 
@@ -126,11 +129,10 @@ def sdg_classification(haystack_doc:List[Document],
 
     return df, x
 
-def runSDGPreprocessingPipeline(
+def runSDGPreprocessingPipeline(file_name:str, file_path:str,
                     split_by: Literal["sentence", "word"] = 'sentence',
-                    split_respect_sentence_boundary = False,
-
-                    remove_punc = False)->List[Document]:
+                    split_length:int = 2, split_respect_sentence_boundary:bool = False,
+                    split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
     """
     creates the pipeline and runs the preprocessing pipeline,
     the params for pipeline are fetched from paramconfig
@@ -140,13 +142,16 @@ def runSDGPreprocessingPipeline(filePath, fileName,
 
     file_name: filename, in case of streamlit application use
                 st.session_state['filename']
-    file_path: filepath, in case of streamlit application use
-    removePunc: to remove all Punctuation including ',' and '.' or not
+    file_path: filepath, in case of streamlit application use st.session_state['filepath']
     split_by: document splitting strategy either as word or sentence
     split_length: when synthetically creating the paragrpahs from document,
                 it defines the length of paragraph.
     split_respect_sentence_boundary: Used when using 'word' strategy for
                 splititng of text.
+    split_overlap: Number of words or sentences that overlap when creating
+                the paragraphs. This is done as one sentence or 'some words' make sense
+                when read in together with others. Therefore the overlap is used.
+    remove_punc: to remove all Punctuation including ',' and '.' or not
 
 
     Return
@@ -160,9 +165,9 @@ def runSDGPreprocessingPipeline(filePath, fileName,
 
     sdg_processing_pipeline = processingpipeline()
 
-    output_sdg_pre = sdg_processing_pipeline.run(file_paths =
-                            params= {"FileConverter": {"file_path":
-                                        "file_name":
+    output_sdg_pre = sdg_processing_pipeline.run(file_paths = file_path,
+                            params= {"FileConverter": {"file_path": file_path, \
+                                        "file_name": file_name},
                             "UdfPreProcessor": {"remove_punc": remove_punc, \
                             "split_by": split_by, \
                             "split_length":split_length,\
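Note (outside this commit): with the new keyword-style signature, a typical call chain looks like the sketch below; the file names, split values and threshold are placeholders:

from utils.sdg_classifier import runSDGPreprocessingPipeline, sdg_classification

all_documents = runSDGPreprocessingPipeline(file_name="policy.pdf",
                        file_path="docStore/policy.pdf", split_by="word",
                        split_length=80, split_respect_sentence_boundary=True,
                        split_overlap=10, remove_punc=False)

# df: per-paragraph SDG labels kept above the threshold
# x : per-SDG counts, now sorted descending by the sort_values fix above
df, x = sdg_classification(haystack_doc=all_documents['documents'],
                           threshold=0.85)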
utils/semantic_search.py
CHANGED
@@ -1,15 +1,16 @@
-from haystack.nodes import TransformersQueryClassifier
+from haystack.nodes import TransformersQueryClassifier, Docs2Answers
 from haystack.nodes import EmbeddingRetriever, FARMReader
 from haystack.nodes.base import BaseComponent
 from haystack.document_stores import InMemoryDocumentStore
 from markdown import markdown
 from annotated_text import annotation
 from haystack.schema import Document
-from typing import List, Text
+from typing import List, Text, Union
 from typing_extensions import Literal
 from utils.preprocessing import processingpipeline
 from utils.streamlitcheck import check_streamlit
 from haystack.pipelines import Pipeline
+import pandas as pd
 import logging
 try:
     from termcolor import colored
@@ -37,9 +38,13 @@ class QueryCheck(BaseComponent):
     Uses Query Classifier from Haystack, process the query based on query type.
     Ability to determine the statements is not so good, therefore the chances
     statement also get modified. Ex: "List water related issues" will be
-    identified by the model as keywords, and therefore it be processed as "what
-    the 'list all water related issues' related issues and discussions?".
-    but is igonred for now, as semantic search will not
+    identified by the model as keywords, and therefore it be processed as "what
+    are the 'list all water related issues' related issues and discussions?".
+    This is one shortcoming but is igonred for now, as semantic search will not
+    get affected a lot, by this. If you want to pass keywords list and want to
+    do batch processing use. run_batch. Example: if you want to find relevant
+    passages for water, food security, poverty then querylist = ["water", "food
+    security","poverty"] and then execute QueryCheck.run_batch(queries = querylist)
 
     1. https://docs.haystack.deepset.ai/docs/query_classifier
 
@@ -47,11 +52,22 @@ class QueryCheck(BaseComponent):
 
     outgoing_edges = 1
 
-    def run(self, query):
+    def run(self, query:str):
         """
-        mandatory method to use the
+        mandatory method to use the custom node. Determines the query type, if
         if the query is of type keyword/statement will modify it to make it more
         useful for sentence transoformers.
+
+        Params
+        --------
+        query: query/statement/keywords in form of string
+
+        Return
+        ------
+        output: dictionary, with key as identifier and value could be anything
+            we need to return. In this case the output contain key = 'query'.
+
+        output_1: As there is only one outgoing edge, we pass 'output_1' string
 
         """
         query_classifier = loadQueryClassifier()
@@ -61,20 +77,51 @@ class QueryCheck(BaseComponent):
             output = {"query":query,
                       "query_type": 'question/statement'}
         else:
-            output = {"query": "what are the {} related issues and
+            output = {"query": "what are the {} related issues and \
+                      discussions?".format(query),
                       "query_type": 'statements/keyword'}
         logging.info(output)
         return output, "output_1"
 
-    def run_batch(self,
-
+    def run_batch(self, queries:List[str]):
+        """
+        running multiple queries in one go, howeevr need the queries to be passed
+        as list of string. Example: if you want to find relevant passages for
+        water, food security, poverty then querylist = ["water", "food security",
+        "poverty"] and then execute QueryCheck.run_batch(queries = querylist)
+
+        Params
+        --------
+        queries: queries/statements/keywords in form of string encapsulated
+            within List
+
+        Return
+        ------
+        output: dictionary, with key as identifier and value could be anything
+            we need to return. In this case the output contain key = 'queries'.
+
+        output_1: As there is only one outgoing edge, we pass 'output_1' string
+        """
+        query_classifier = loadQueryClassifier()
+        query_list = []
+        for query in queries:
+            result = query_classifier.run(query=query)
+            if result[1] == "output_1":
+                query_list.append(query)
+            else:
+                query_list.append("what are the {} related issues and \
+                                  discussions?".format(query))
+        output = {'queries':query_list}
+        logging.info(output)
+        return output, "output_1"
+
 
 @st.cache(allow_output_mutation=True)
-def runSemanticPreprocessingPipeline(file_path, file_name,
+def runSemanticPreprocessingPipeline(file_path:str, file_name:str,
                     split_by: Literal["sentence", "word"] = 'sentence',
-
-
-                    remove_punc = False)->List[Document]:
+                    split_length:int = 2, split_overlap:int = 0,
+                    split_respect_sentence_boundary:bool = False,
+                    remove_punc:bool = False)->List[Document]:
     """
     creates the pipeline and runs the preprocessing pipeline.
 
@@ -82,22 +129,25 @@ def runSemanticPreprocessingPipeline(file_path, file_name,
     ------------
 
     file_name: filename, in case of streamlit application use
-
+                st.session_state['filename']
     file_path: filepath, in case of streamlit application use
-
-    removePunc: to remove all Punctuation including ',' and '.' or not
+                st.session_state['filepath']
     split_by: document splitting strategy either as word or sentence
     split_length: when synthetically creating the paragrpahs from document,
-
+                it defines the length of paragraph.
+    split_overlap: Number of words or sentences that overlap when creating the
+                paragraphs. This is done as one sentence or 'some words' make sense
+                when read in together with others. Therefore the overlap is used.
     split_respect_sentence_boundary: Used when using 'word' strategy for
-
+                splititng of text.
+    remove_punc: to remove all Punctuation including ',' and '.' or not
 
     Return
     --------------
     List[Document]: When preprocessing pipeline is run, the output dictionary
-
-
-
+        has four objects. For the Haysatck implementation of semantic search we,
+        need to use the List of Haystack Document, which can be fetched by
+        key = 'documents' on output.
 
     """
 
@@ -106,7 +156,7 @@ def runSemanticPreprocessingPipeline(file_path, file_name,
     output_semantic_pre = semantic_processing_pipeline.run(file_paths = file_path,
                             params= {"FileConverter": {"file_path": file_path, \
                                         "file_name": file_name},
-
+                            "UdfPreProcessor": {"remove_punc": remove_punc, \
                                 "split_by": split_by, \
                                 "split_length":split_length,\
                                 "split_overlap": split_overlap,
@@ -115,10 +165,11 @@ def runSemanticPreprocessingPipeline(file_path, file_name,
     return output_semantic_pre
 
 
-@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
-
+@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
+          allow_output_mutation=True)
+def loadRetriever(embedding_model:Text=None, embedding_model_format:Text = None,
                 embedding_layer:int = None, retriever_top_k:int = 10,
-                max_seq_len:int
+                max_seq_len:int=512, document_store:InMemoryDocumentStore=None):
     """
     Returns the Retriever model based on params provided.
     1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
@@ -129,14 +180,16 @@ def loadRetriever(embedding_model:Text = None, embedding_model_format:Text = No
     Params
     ---------
     embedding_model: Name of the model to be used for embedding. Check the links
-
-    embedding_model_format: check the github link of Haystack provided in
-
-
-
-
-
-
+        provided in documentation
+    embedding_model_format: check the github link of Haystack provided in
+        documentation embedding_layer: check the github link of Haystack
+        provided in documentation retriever_top_k: Number of Top results to
+        be returned by
+        retriever max_seq_len: everymodel has max seq len it can handle, check in
+        model card. Needed to hanlde the edge cases.
+    document_store: InMemoryDocumentStore, write haystack Document list to
+        DocumentStore and pass the same to function call. Can be done using
+        createDocumentStore from utils.
 
     Return
     -------
@@ -157,7 +210,8 @@ def loadRetriever(embedding_model:Text = None, embedding_model_format:Text = No
         st.session_state['retriever'] = retriever
     return retriever
 
-@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
+@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
+          allow_output_mutation=True)
 def createDocumentStore(documents:List[Document], similarity:str = 'dot_product',
                         embedding_dim:int = 768):
     """
@@ -167,11 +221,11 @@ def createDocumentStore(documents:List[Document], similarity:str = 'dot_product'
     Params
     -------
     documents: List of haystack document. If using the preprocessing pipeline,
-
+        can be fetched key = 'documents; on output of preprocessing pipeline.
     similarity: scoring function, can be either 'cosine' or 'dot_product'
     embedding_dim: Document store has default value of embedding size = 768, and
-
-
+        update_embeddings method of Docstore cannot infer the embedding size of
+        retiever automatically, therefore set this value as per the model card.
 
     Return
     -------
@@ -185,13 +239,13 @@ def createDocumentStore(documents:List[Document], similarity:str = 'dot_product'
     return document_store
 
 
-@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
+@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
+          allow_output_mutation=True)
 def semanticSearchPipeline(documents:List[Document], embedding_model:Text = None,
-
-
-
-
-                ):
+                embedding_model_format:Text = None,embedding_layer:int = None,
+                embedding_dim:int = 768,retriever_top_k:int = 10,
+                reader_model:str = None, reader_top_k:int = 10,
+                max_seq_len:int =512,useQueryCheck = True, ):
     """
     creates the semantic search pipeline and document Store object from the
     list of haystack documents. The top_k for the Reader and Retirever are kept
@@ -201,6 +255,14 @@ def semanticSearchPipeline(documents:List[Document], embedding_model:Text = Non
     and to some extent extractive QA purpose. The purpose of Reader is strictly to
     highlight the context for retrieved result and not for QA, however as stated
     it can work for QA too in limited sense.
+    There are 4 variants of pipeline it can return
+    1.QueryCheck > Retriever > Reader
+    2.Retriever > Reader
+    3.QueryCheck > Retriever > Docs2Answers : If reader is None,
+    then Doc2answer is used to keep the output of pipeline structurally same.
+    4.Retriever > Docs2Answers
+
+    Links
 
     1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
     2. https://www.sbert.net/examples/applications/semantic-search/README.html
@@ -208,37 +270,39 @@ def semanticSearchPipeline(documents:List[Document], embedding_model:Text = Non
     4. https://docs.haystack.deepset.ai/docs/reader
 
 
-
     Params
     ----------
     documents: list of Haystack Documents, returned by preprocessig pipeline.
     embedding_model: Name of the model to be used for embedding. Check the links
-
-    embedding_model_format: check the github link of Haystack provided in
+        provided in documentation
+    embedding_model_format: check the github link of Haystack provided in
+        documentation
     embedding_layer: check the github link of Haystack provided in documentation
+    embedding_dim: Document store has default value of embedding size = 768, and
+        update_embeddings method of Docstore cannot infer the embedding size of
+        retiever automatically, therefore set this value as per the model card.
     retriever_top_k: Number of Top results to be returned by retriever
     reader_model: Name of the model to be used for Reader node in hasyatck
-
+        Pipeline. Check the links provided in documentation
     reader_top_k: Reader will use retrieved results to further find better matches.
-
-
-    useQueryCheck: Whether to use the querycheck which modifies the query or not.
-    embedding_dim: Document store has default value of embedding size = 768, and
-    update_embeddings method of Docstore cannot infer the embedding size of
-    retiever automaticallu, therefore set this value as per the model card.
+        As purpose here is to use reader to extract context, the value is
+        same as retriever_top_k.
     max_seq_len:everymodel has max seq len it can handle, check in model card.
-
+        Needed to hanlde the edge cases
+    useQueryCheck: Whether to use the querycheck which modifies the query or not.
 
     Return
     ---------
     semanticsearch_pipeline: Haystack Pipeline object, with all the necessary
-
+        nodes [QueryCheck, Retriever, Reader/Docs2Answer]. If reader is None,
+        then Doc2answer is used to keep the output of pipeline structurally
+        same.
 
     document_store: As retriever can work only with Haystack Document Store, the
-
-
-
+        list of document returned by preprocessing pipeline are fed into to
+        get InMemmoryDocumentStore object type, with retriever updating the
+        embeddings of each paragraph in document store.
 
     """
     document_store = createDocumentStore(documents=documents,
@@ -248,34 +312,187 @@ def semanticSearchPipeline(documents:List[Document], embedding_model:Text = Non
                         embedding_layer=embedding_layer,
                         retriever_top_k= retriever_top_k,
                         document_store = document_store,
-                        max_seq_len=max_seq_len)
-
+                        max_seq_len=max_seq_len)
     document_store.update_embeddings(retriever)
-    reader = FARMReader(model_name_or_path=reader_model,
-                        top_k = reader_top_k, use_gpu=True)
     semantic_search_pipeline = Pipeline()
     if useQueryCheck and reader_model:
        querycheck = QueryCheck()
-
-
-       semantic_search_pipeline.add_node(component =
-
+       reader = FARMReader(model_name_or_path=reader_model,
+                           top_k = reader_top_k, use_gpu=True)
+       semantic_search_pipeline.add_node(component = querycheck,
+                           name = "QueryCheck",inputs = ["Query"])
+       semantic_search_pipeline.add_node(component = retriever,
+                           name = "EmbeddingRetriever",inputs = ["QueryCheck.output_1"])
        semantic_search_pipeline.add_node(component = reader, name = "FARMReader",
                            inputs= ["EmbeddingRetriever"])
+
    elif reader_model :
-
-
-       semantic_search_pipeline.add_node(component =
-
+       reader = FARMReader(model_name_or_path=reader_model,
+                           top_k = reader_top_k, use_gpu=True)
+       semantic_search_pipeline.add_node(component = retriever,
+                           name = "EmbeddingRetriever",inputs = ["Query"])
+       semantic_search_pipeline.add_node(component = reader,
+                           name = "FARMReader",inputs= ["EmbeddingRetriever"])
+   elif useQueryCheck and not reader_model:
+       querycheck = QueryCheck()
+       docs2answers = Docs2Answers()
+       semantic_search_pipeline.add_node(component = querycheck,
+                           name = "QueryCheck",inputs = ["Query"])
+       semantic_search_pipeline.add_node(component = retriever,
+                           name = "EmbeddingRetriever",inputs = ["QueryCheck.output_1"])
+       semantic_search_pipeline.add_node(component = docs2answers,
+                           name = "Docs2Answers",inputs= ["EmbeddingRetriever"])
+   elif not useQueryCheck and not reader_model:
+       docs2answers = Docs2Answers()
+       semantic_search_pipeline.add_node(component = retriever,
+                           name = "EmbeddingRetriever",inputs = ["Query"])
+       semantic_search_pipeline.add_node(component = docs2answers,
+                           name = "Docs2Answers",inputs= ["EmbeddingRetriever"])
+
+   logging.info(semantic_search_pipeline.components)
+   return semantic_search_pipeline, document_store
+
+def runSemanticPipeline(pipeline:Pipeline, queries:Union[list,str])->dict:
+    """
+    will use the haystack run or run_batch based on if single query is passed
+    as string or multiple queries as List[str]
+
+    Params
+    -------
+    pipeline: haystack pipeline, this is same as returned by semanticSearchPipeline
+        from utils.semanticsearch
+
+    queries: Either a single query or list of queries.
+
+    Return
+    -------
+    results: Dict containing answers and documents as key and their respective
+        values
+
+    """
+
+    if type(queries) == list:
+        results = pipeline.run_batch(queries=queries)
+    elif type(queries) == str:
+        results = pipeline.run(query=queries)
     else:
-
-
+        logging.info("Please check the input type for the queries")
+        return
 
-
+    return results
 
+def process_query_output(results:dict)->pd.DataFrame:
+    """
+    Returns the dataframe with necessary information like including
+    ['query','answer','answer_offset','context_offset','context','content',
+    'reader_score','retriever_score','id',]. This is designed for output given
+    by semantic search pipeline with single query and final node as reader.
+    The output of pipeline having Docs2Answers as final node or multiple queries
+    need to be handled separately. In these other cases, use process_semantic_output
+    from utils.semantic_search which uses this function internally to make one
+    combined dataframe.
+
+    Params
+    ---------
+    results: this dictionary should have key,values with
+        keys = [query,answers,documents], however answers is optional.
+        in case of [Doc2Answers as final node], process_semantic_output
+        doesnt return answers thereby setting all values contained in
+        answers to 'None'
+
+    Return
+    --------
+    df: dataframe with all the columns mentioned in function description.
+
+    """
+    query_text = results['query']
+    if 'answers' in results.keys():
+        answer_dict = {}
+
+        for answer in results['answers']:
+            answer_dict[answer.document_id] = answer.to_dict()
+    else:
+        answer_dict = {}
+    docs = results['documents']
+    df = pd.DataFrame(columns=['query','answer','answer_offset','context_offset',
+                               'context','content','reader_score','retriever_score',
+                               'id'])
+    for doc in docs:
+        row_list = {}
+        row_list['query'] = query_text
+        row_list['retriever_score'] = doc.score
+        row_list['id'] = doc.id
+        row_list['content'] = doc.content
+        if doc.id in answer_dict.keys():
+            row_list['answer'] = answer_dict[doc.id]['answer']
+            row_list['context'] = answer_dict[doc.id]['context']
+            row_list['reader_score'] = answer_dict[doc.id]['score']
+            answer_offset = answer_dict[doc.id]['offsets_in_document'][0]
+            row_list['answer_offset'] = [answer_offset['start'],answer_offset['end']]
+            start_idx = doc.content.find(row_list['context'])
+            end_idx = start_idx + len(row_list['context'])
+            row_list['context_offset'] = [start_idx, end_idx]
+        else:
+            row_list['answer'] = None
+            row_list['context'] = None
+            row_list['reader_score'] = None
+            row_list['answer_offset'] = None
+            row_list['context_offset'] = None
+        df_dictionary = pd.DataFrame([row_list])
+        df = pd.concat([df, df_dictionary], ignore_index=True)
+
+    return df
+
+def process_semantic_output(results):
+    """
+    Returns the dataframe with necessary information like including
+    ['query','answer','answer_offset','context_offset','context','content',
+    'reader_score','retriever_score','id',]. Distingushes if its single query or
+    multi queries by reading the pipeline output dictionary keys.
+    Uses the process_query_output to get the dataframe for each query and create
+    one concataneted dataframe. In case f Docs2Answers as final node, deletes
+    the answers part. See documentations of process_query_output.
+
+    Params
+    ---------
+    results: raw output of runSemanticPipeline.
+
+    Return
+    --------
+    df: dataframe with all the columns mentioned in function description.
+
+    """
+    output = {}
+    if 'query' in results.keys():
+        output['query'] = results['query']
+        output['documents'] = results['documents']
+        if results['node_id'] == 'Docs2Answers':
+            pass
+        else:
+            output['answers'] = results['answers']
+        df = process_query_output(output)
+        return df
+    if 'queries' in results.keys():
+        df = pd.DataFrame(columns=['query','answer','answer_offset',
+                                   'context_offset','context','content',
+                                   'reader_score','retriever_score','id'])
+        for query,answers,documents in zip(results['queries'],
+                                results['answers'],results['documents']):
+            output = {}
+            output['query'] = query
+            output['documents'] = documents
+            if results['node_id'] == 'Docs2Answers':
+                pass
+            else:
+                output['answers'] = answers
+
+            temp = process_query_output(output)
+            df = pd.concat([df, temp], ignore_index=True)
+
+
+    return df
 
-def semanticsearchAnnotator(matches:
+def semanticsearchAnnotator(matches:List[List[int]], document:Text):
     """
     Annotates the text in the document defined by list of [start index, end index]
     Example: "How are you today", if document type is text, matches = [[0,3]]
@@ -311,12 +528,14 @@ def semanticsearchAnnotator(matches: List[List[int]], document):
     print(annotated_text)
 
 
-def semantic_keywordsearch(query:Text,documents:List[Document],
+def semantic_keywordsearch(query:Text,documents:List[Document],
+                    embedding_model:Text,
                     embedding_model_format:Text,
-
-
-
-
+                    embedding_layer:int, reader_model:str,
+                    retriever_top_k:int = 10, reader_top_k:int = 10,
+                    return_results:bool = False, embedding_dim:int = 768,
+                    max_seq_len:int = 512,
+                    sort_by:Literal["retriever", "reader"] = 'retriever'):
     """
     Performs the Semantic search on the List of haystack documents which is
     returned by preprocessing Pipeline.
@@ -327,7 +546,7 @@ def semantic_keywordsearch(query:Text,documents:List[Document],embedding_model:T
     documents: List fo Haystack documents returned by preprocessing pipeline.
 
     """
-    semanticsearch_pipeline, doc_store = semanticSearchPipeline(documents,
+    semanticsearch_pipeline, doc_store = semanticSearchPipeline(documents = documents,
                     embedding_model= embedding_model,
                     embedding_layer= embedding_layer,
                     embedding_model_format= embedding_model_format,
@@ -335,22 +554,24 @@ def semantic_keywordsearch(query:Text,documents:List[Document],embedding_model:T
                     reader_top_k= reader_top_k, embedding_dim=embedding_dim,
                     max_seq_len=max_seq_len)
 
-
+    raw_output = runSemanticPipeline(semanticsearch_pipeline,query)
+    results_df = process_semantic_output(raw_output)
+    if sort_by == 'retriever':
+        results_df = results_df.sort_values(by=['retriever_score'], ascending=False)
+    else:
+        results_df = results_df.sort_values(by=['reader_score'], ascending=False)
+
     if return_results:
-        return
+        return results_df
     else:
         if check_streamlit:
             st.markdown("##### Top few semantic search results #####")
         else:
            print("Top few semantic search results")
-        for i
-            temp = answer.to_dict()
-            doc = doc_store.get_document_by_id(temp['document_id']).content
-            start_idx = doc.find(temp['context'])
-            end_idx = start_idx + len(temp['context'])
-            match = [[start_idx,end_idx]]
+        for i in range(len(results_df)):
            if check_streamlit:
               st.write("Result {}".format(i+1))
            else:
              print("Result {}".format(i+1))
-           semanticsearchAnnotator(
+           semanticsearchAnnotator(results_df.loc[i]['context_offset'],
+                                   results_df.loc[i]['content'] )
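Note (outside this commit): end to end, the new helpers chain as sketched below; the embedding model name and file paths are illustrative placeholders, and reader_model=None exercises the Docs2Answers fallback described in the docstring:

from utils.semantic_search import (runSemanticPreprocessingPipeline,
        semanticSearchPipeline, runSemanticPipeline, process_semantic_output)

output = runSemanticPreprocessingPipeline(file_path="docStore/policy.pdf",
                                          file_name="policy.pdf")

# reader_model=None -> variant 3: QueryCheck > Retriever > Docs2Answers
pipeline, doc_store = semanticSearchPipeline(documents=output['documents'],
        embedding_model="sentence-transformers/msmarco-distilbert-cos-v5",
        embedding_model_format="sentence_transformers",
        embedding_layer=None, reader_model=None)

# a str runs pipeline.run(); a List[str] runs pipeline.run_batch()
raw = runSemanticPipeline(pipeline, ["water", "food security"])
results_df = process_semantic_output(raw)  # one combined DataFrame for both queries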
|