Spaces: GIZ

prashant committed
Commit 048a702 (parent: 4d2be28)

decorator test

app.py CHANGED
@@ -1,4 +1,4 @@
-import appStore.keyword_search as keyword_search
+# import appStore.keyword_search as keyword_search
 import appStore.sdg_analysis as sdg_analysis
 #import appStore.coherence as coherence
 import appStore.info as info
@@ -12,6 +12,6 @@ app = MultiApp()
 
 app.add_app("About","house", info.app)
 app.add_app("SDG Analysis","gear",sdg_analysis.app)
-app.add_app("Search","search", keyword_search.app)
+# app.add_app("Search","search", keyword_search.app)
 
 app.run()
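For context, app.py drives a small multi-page Streamlit launcher; the commit only comments the Search page out of its registry. A minimal sketch of the MultiApp pattern it assumes (the class lives elsewhere in the repo, so everything here is illustrative):

import streamlit as st

# Hypothetical sketch of the MultiApp helper used by app.py: each page
# registers a title, an icon name, and a render callback.
class MultiApp:
    def __init__(self):
        self.apps = []  # list of (title, icon, render function)

    def add_app(self, title, icon, func):
        self.apps.append((title, icon, func))

    def run(self):
        # choose a page in the sidebar, then call its render function
        titles = [title for title, _, _ in self.apps]
        choice = st.sidebar.radio("Go to", titles)
        for title, _, func in self.apps:
            if title == choice:
                func()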
appStore/keyword_search.py CHANGED
@@ -89,4 +89,10 @@ def app():
         with st.spinner("Performing Similar/Contextual search"):
             semantic_search(queryList,paraList)
 
+    else:
+        st.info("🤔 No document found, please try to upload it at the sidebar!")
+        logging.warning("Terminated as no document provided")
+
+
+
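The added else branch stops the search when no document was uploaded. A minimal sketch of the resulting control flow; the 'filepath' session key is an assumption borrowed from runSDGPreprocessingPipeline below:

import logging
import streamlit as st

def app():
    if 'filepath' in st.session_state:
        with st.spinner("Performing Similar/Contextual search"):
            pass  # placeholder for semantic_search(queryList, paraList)
    else:
        st.info("🤔 No document found, please try to upload it at the sidebar!")
        logging.warning("Terminated as no document provided")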
utils/preprocessing.py CHANGED
@@ -23,7 +23,7 @@ def useOCR(file_path: str)-> Text:
     file_path: file_path of uploade file, returned by add_upload function in
     uploadAndExample.py
 
-    Returns the text files as string.
+    Returns the text file as string.
     """
 
 
@@ -242,7 +242,8 @@ class UdfPreProcessor(BaseComponent):
 
 def processingpipeline():
     """
-    Returns the preprocessing pipeline
+    Returns the preprocessing pipeline. Will use FileConverter and UdfPreProcesor
+    from utils.
 
     """
 
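The updated docstring says the pipeline wires FileConverter and UdfPreProcessor together. In Haystack v1 terms that would look roughly like the sketch below; the node names and the exact import path are assumptions, not the repo's actual code:

from haystack.pipelines import Pipeline
# FileConverter and UdfPreProcessor are the custom components defined in
# utils/preprocessing.py; this sketch assumes they are importable as such.
from utils.preprocessing import FileConverter, UdfPreProcessor

def processingpipeline():
    # Indexing-style pipeline: raw file -> converted text -> preprocessed docs
    pipeline = Pipeline()
    pipeline.add_node(component=FileConverter(), name="FileConverter",
                      inputs=["File"])
    pipeline.add_node(component=UdfPreProcessor(), name="UdfPreProcessor",
                      inputs=["FileConverter"])
    return pipeline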
utils/sdg_classifier.py CHANGED
@@ -3,13 +3,14 @@ from haystack.schema import Document
 from typing import List, Tuple
 import configparser
 import streamlit as st
+from utils.streamlitcheck import check_streamlit
 from pandas import DataFrame, Series
 import logging
 from utils.preprocessing import processingpipeline
 config = configparser.ConfigParser()
 config.read_file(open('paramconfig.cfg'))
 
-@st.cache(allow_output_mutation=True)
+
 def load_sdgClassifier():
     """
     loads the document classifier using haystack, where the name/path of model
@@ -49,11 +50,14 @@ def sdg_classification(haystackdoc:List[Document])->Tuple[DataFrame,Series]:
     logging.info("running SDG classifiication")
     threshold = float(config.get('sdg','THRESHOLD'))
 
-
-    classifier = load_sdgClassifier()
+    if check_streamlit():
+        st.write("caching model")
+        classifier = st.cache(load_sdgClassifier(), allow_output_mutation=True)
+    else:
+        classifier = load_sdgClassifier()
     results = classifier.predict(haystackdoc)
 
-
+
     labels_= [(l.meta['classification']['label'],
                l.meta['classification']['score'],l.content,) for l in results]
 
@@ -68,10 +72,19 @@ def sdg_classification(haystackdoc:List[Document])->Tuple[DataFrame,Series]:
 
     return df, x
 
-def runSDGPreprocessingPipeline()->List[Document]:
+def runSDGPreprocessingPipeline(file_path = None, file_name = None)->List[Document]:
     """
     creates the pipeline and runs the preprocessing pipeline,
     the params for pipeline are fetched from paramconfig
+
+    Param
+    ------------
+
+    file_path: filepath, if not given will check for file_path in streamlit
+    session_state, else will return
+
+    file_name: filename, if not given will check for file_name in streamlit
+    session_state
 
     Return
     --------------
@@ -81,6 +94,7 @@ def runSDGPreprocessingPipeline()->List[Document]:
     key = 'documents' on output.
 
     """
+    # if file_path:
     file_path = st.session_state['filepath']
    file_name = st.session_state['filename']
     sdg_processing_pipeline = processingpipeline()
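The "decorator test" in the commit title is this hunk: @st.cache moves from a decorator on load_sdgClassifier to a runtime call. Note that st.cache memoizes a callable, so st.cache(load_sdgClassifier(), ...) hands it the already-constructed classifier rather than the loader and will not avoid reloading the model on reruns. The runtime equivalent of the removed decorator would be a sketch like this (the loader stub stands in for the real one defined in this module):

import streamlit as st
from utils.streamlitcheck import check_streamlit

def load_sdgClassifier():
    ...  # stand-in for the real loader defined earlier in this module

# Pass the function object to st.cache, then call the cached wrapper.
if check_streamlit():
    st.write("caching model")
    cached_loader = st.cache(load_sdgClassifier, allow_output_mutation=True)
    classifier = cached_loader()
else:
    classifier = load_sdgClassifier()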
utils/semantic_search.py CHANGED
@@ -107,19 +107,25 @@ def semanticSearchPipeline(documents:List[Document]):
 
     document_store = InMemoryDocumentStore()
     document_store.write_documents(documents)
-
+    if 'retriever' in st.session_state:
+        retriever = st.session_state['retriever']
+        document_store.update_embeddings(retriever)
+        # querycheck =
+
+
+        # embedding_model = config.get('semantic_search','RETRIEVER')
+        # embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
+        # embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
+        # retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
+        # retriever = EmbeddingRetriever(
+        #     document_store=document_store,
+        #     embedding_model=embedding_model,top_k = retriever_top_k,
+        #     emb_extraction_layer=embedding_layer, scale_score =True,
+        #     model_format=embedding_model_format, use_gpu = True)
+        # document_store.update_embeddings(retriever)
+    else:
     embedding_model = config.get('semantic_search','RETRIEVER')
     embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
-    embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
-    retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
-    retriever = EmbeddingRetriever(
-        document_store=document_store,
-        embedding_model=embedding_model,top_k = retriever_top_k,
-        emb_extraction_layer=embedding_layer, scale_score =True,
-        model_format=embedding_model_format, use_gpu = True)
-    document_store.update_embeddings(retriever)
-    else:
-
     retriever = EmbeddingRetriever(
         document_store=document_store,
         embedding_model=embedding_model,top_k = retriever_top_k,
@@ -134,13 +140,24 @@ def semanticSearchPipeline(documents:List[Document]):
     embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
     embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
     retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
+
+
     retriever = EmbeddingRetriever(
         document_store=document_store,
         embedding_model=embedding_model,top_k = retriever_top_k,
         emb_extraction_layer=embedding_layer, scale_score =True,
         model_format=embedding_model_format, use_gpu = True)
+    st.session_state['retriever'] = retriever
     document_store.update_embeddings(retriever)
     st.session_state['document_store'] = document_store
+    querycheck = QueryCheck()
+    st.session_state['querycheck'] = querycheck
+    reader_model = config.get('semantic_search','READER')
+    reader_top_k = retriever_top_k
+    reader = FARMReader(model_name_or_path=reader_model,
+                        top_k = reader_top_k, use_gpu=True)
+
+    st.session_state['reader'] = reader
 
     querycheck = QueryCheck()
 
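These hunks cache the heavy objects (retriever, reader, query checker) in st.session_state so they are built once per session and reused on Streamlit reruns instead of being re-embedded every time. The pattern, reduced to its generic form; get_or_build is a hypothetical helper, not code from this repo:

import streamlit as st

def get_or_build(key, builder):
    # Build an expensive object on the first run, then reuse it from
    # st.session_state on every subsequent Streamlit rerun.
    if key not in st.session_state:
        st.session_state[key] = builder()
    return st.session_state[key]

# e.g. retriever = get_or_build('retriever', make_retriever)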
utils/streamlitcheck.py ADDED
@@ -0,0 +1,19 @@
+def check_streamlit():
+    """
+    Function to check whether python code is run within streamlit
+
+    Returns
+    -------
+    use_streamlit : boolean
+        True if code is run within streamlit, else False
+    """
+    try:
+        from streamlit.script_run_context import get_script_run_ctx
+        if not get_script_run_ctx():
+            use_streamlit = False
+        else:
+            use_streamlit = True
+    except ModuleNotFoundError:
+        use_streamlit = False
+    return use_streamlit
+
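check_streamlit lets shared utility code branch on whether it is executing inside a Streamlit script run (note that the streamlit.script_run_context import path varies between Streamlit versions). A typical call site might look like:

from utils.streamlitcheck import check_streamlit

if check_streamlit():
    import streamlit as st
    st.write("running inside a Streamlit app")
else:
    print("running as a plain Python script")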