prashant committed
Commit 9f55059
1 Parent(s): 949b596
refactoring semantic_search; PEP edits in other files
- appStore/multiapp.py +1 -1
- appStore/sdg_analysis.py +6 -7
- utils/checkconfig.py +5 -2
- utils/keyword_extraction.py +2 -2
- utils/lexical_search.py +11 -7
- utils/preprocessing.py +5 -5
- utils/sdg_classifier.py +22 -17
- utils/semantic_search.py +311 -90
appStore/multiapp.py
CHANGED
@@ -46,7 +46,7 @@ class MultiApp:
 
         st.sidebar.write(format_func=lambda app: app['title'])
         image = Image.open('docStore/img/giz_sdsn_small.jpg')
-        st.sidebar.image(image, width =
+        st.sidebar.image(image, width =200)
 
         with st.sidebar:
             selected = option_menu(None, [page["title"] for page in self.apps],
appStore/sdg_analysis.py
CHANGED
@@ -93,12 +93,11 @@ def app():
             file_path = st.session_state['filepath']
             classifier = load_sdgClassifier(classifier_name=model_name)
             st.session_state['sdg_classifier'] = classifier
-            all_documents = runSDGPreprocessingPipeline(
-
+            all_documents = runSDGPreprocessingPipeline(file_name= file_name,
+                            file_path= file_path, split_by= split_by,
                             split_length= split_length,
-                            split_overlap= split_overlap,
                             split_respect_sentence_boundary= split_respect_sentence_boundary,
-                            remove_punc= remove_punc)
+                            split_overlap= split_overlap, remove_punc= remove_punc)
 
             if len(all_documents['documents']) > 100:
                 warning_msg = ": This might take sometime, please sit back and relax."
@@ -110,14 +109,14 @@ def app():
             df, x = sdg_classification(haystack_doc=all_documents['documents'],
                                        threshold= threshold)
             df = df.drop(['Relevancy'], axis = 1)
-            sdg_labels = x.SDG.unique()
+            sdg_labels = x.SDG.unique()
             textrank_keyword_list = []
             for label in sdg_labels:
                 sdgdata = " ".join(df[df.SDG == label].text.to_list())
                 textranklist_ = textrank(textdata=sdgdata, words= top_n)
                 if len(textranklist_) > 0:
                     textrank_keyword_list.append({'SDG':label, 'TextRank Keywords':",".join(textranklist_)})
-
+            textrank_keywords_df = pd.DataFrame(textrank_keyword_list)
 
 
             plt.rcParams['font.size'] = 25
@@ -145,7 +144,7 @@ def app():
             st.write("")
             st.markdown("###### What keywords are present under SDG classified text? ######")
 
-            AgGrid(
+            AgGrid(textrank_keywords_df, reload_data = False,
                    update_mode="value_changed",
                    columns_auto_size_mode = ColumnsAutoSizeMode.FIT_CONTENTS)
             st.write("")
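Note (outside this commit): the loop above builds one dict per SDG label and the new line turns them into the DataFrame that AgGrid renders. A minimal standalone sketch of that data flow, with toy data and a stubbed textrank (the real one lives in utils.keyword_extraction):

import pandas as pd

# stub standing in for utils.keyword_extraction.textrank (returns keyword strings)
def textrank(textdata, words=10):
    return textdata.split()[:words]

textrank_keyword_list = []
for label, text in [(6, "water sanitation access"), (7, "clean energy supply")]:
    textranklist_ = textrank(text, words=2)
    if len(textranklist_) > 0:
        textrank_keyword_list.append({'SDG': label,
                                      'TextRank Keywords': ",".join(textranklist_)})

textrank_keywords_df = pd.DataFrame(textrank_keyword_list)
print(textrank_keywords_df)  # two rows: SDG 6 and SDG 7 with their keywords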
utils/checkconfig.py
CHANGED
@@ -1,12 +1,15 @@
 import configparser
 import logging
 
-def getconfig(
+def getconfig(configfile_path:str):
+    """
+    configfile_path: file path of .cfg file
+    """
 
     config = configparser.ConfigParser()
 
     try:
-        config.read_file(open(
+        config.read_file(open(configfile_path))
        return config
    except:
        logging.warning("config file not found")
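Note (outside this commit): with the config path now passed explicitly, a call looks like the sketch below; 'paramconfig.cfg' and the section/key names are placeholders, not taken from the repo:

from utils.checkconfig import getconfig

config = getconfig('paramconfig.cfg')
# standard configparser access: config.get(section, key), with optional fallback
split_length = int(config.get('semantic_search', 'SPLIT_LENGTH', fallback='2'))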
utils/keyword_extraction.py
CHANGED
@@ -58,7 +58,7 @@ def extract_topn_from_vector(feature_names, sorted_items, top_n=10):
     return results
 
 
-def tfidf_keyword(textdata, vectorizer, tfidfmodel, top_n):
+def tfidf_keyword(textdata:str, vectorizer, tfidfmodel, top_n):
     """
     TFIDF based keywords extraction
 
@@ -108,7 +108,7 @@ def keyword_extraction(sdg:int,sdgdata:List[Text], top_n:int=10):
     return keywords
 
 @st.cache(allow_output_mutation=True)
-def textrank(textdata:Text, ratio:float = 0.1, words = 0):
+def textrank(textdata:Text, ratio:float = 0.1, words:int = 0)->List[str]:
     """
     wrappper function to perform textrank, uses either ratio or wordcount to
     extract top keywords limited by words or ratio.
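Note (outside this commit): the new textrank signature advertises a words-or-ratio switch. A sketch of that dispatch is below, assuming summa as the TextRank backend; the hunk does not show which library the wrapper actually calls, so treat the backend choice as illustrative:

from summa import keywords as summa_keywords

def textrank_sketch(textdata, ratio=0.1, words=0):
    # an explicit word count wins; otherwise fall back to the ratio
    if words > 0:
        return summa_keywords.keywords(textdata, words=words, split=True)
    return summa_keywords.keywords(textdata, ratio=ratio, split=True)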
utils/lexical_search.py
CHANGED
@@ -7,7 +7,7 @@ import streamlit as st
 from markdown import markdown
 from annotated_text import annotation
 from haystack.schema import Document
-from typing import List, Text
+from typing import List, Text, Tuple
 from typing_extensions import Literal
 from utils.preprocessing import processingpipeline
 from utils.streamlitcheck import check_streamlit
@@ -23,10 +23,10 @@ except ImportError:
     logging.info("Streamlit not installed")
 
 
-def runLexicalPreprocessingPipeline(file_path,
+def runLexicalPreprocessingPipeline(file_name:str,file_path:str,
                     split_by: Literal["sentence", "word"] = 'word',
-                    split_length:int = 80,
-
+                    split_length:int = 80, split_overlap:int = 0,
+                    remove_punc:bool = False,)->List[Document]:
     """
     creates the pipeline and runs the preprocessing pipeline,
     the params for pipeline are fetched from paramconfig. As lexical doesnt gets
@@ -40,11 +40,14 @@ def runLexicalPreprocessingPipeline(file_path,file_name,
                 st.session_state['filename']
     file_path: filepath, in case of streamlit application use
                 st.session_state['filepath']
-    removePunc: to remove all Punctuation including ',' and '.' or not
     split_by: document splitting strategy either as word or sentence
     split_length: when synthetically creating the paragrpahs from document,
                 it defines the length of paragraph.
+    split_overlap: Number of words or sentences that overlap when creating
+                the paragraphs. This is done as one sentence or 'some words' make sense
+                when read in together with others. Therefore the overlap is used.
     split_respect_sentence_boundary: Used when using 'word' strategy for
                 splititng of text.
+    removePunc: to remove all Punctuation including ',' and '.' or not
 
     Return
     --------------
@@ -91,7 +94,8 @@ def tokenize_lexical_query(query:str)-> List[str]:
                   if not (token.is_stop or token.is_punct)]
     return token_list
 
-def runSpacyMatcher(token_list:List[str], document:Text
+def runSpacyMatcher(token_list:List[str], document:Text
+                    )->Tuple(List[List[int]],spacy.tokens.doc.Doc):
     """
     Using the spacy in backend finds the keywords in the document using the
     Matcher class from spacy. We can alternatively use the regex, but spacy
@@ -203,7 +207,7 @@ def spacyAnnotator(matches: List[List[int]], document:spacy.tokens.doc.Doc):
     else:
         print(annotated_text)
 
-def lexical_search(query:Text,
+def lexical_search(query:Text, documents:List[Document],top_k:int):
     """
     Performs the Lexical search on the List of haystack documents which is
     returned by preprocessing Pipeline.
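Note (outside this commit): runSpacyMatcher's docstring says keywords are located with spaCy's Matcher class. A self-contained sketch of that mechanism, assuming the small English model is installed (python -m spacy download en_core_web_sm):

import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
doc = nlp("Access to clean water is a core water-related issue.")

matcher = Matcher(nlp.vocab)
# one token-pattern per query token; LOWER makes the match case-insensitive
matcher.add("QUERY", [[{"LOWER": "water"}]])

# each match is (match_id, start_token, end_token); convert to char offsets
spans = [[doc[start].idx, doc[end - 1].idx + len(doc[end - 1])]
         for _, start, end in matcher(doc)]
print(spans)  # character offsets in the shape the annotator functions expect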
utils/preprocessing.py
CHANGED
@@ -120,7 +120,7 @@ class FileConverter(BaseComponent):
         return
 
 
-def basic(s, remove_punc:bool = False):
+def basic(s:str, remove_punc:bool = False):
 
     """
     Performs basic cleaning of text.
@@ -164,10 +164,10 @@ class UdfPreProcessor(BaseComponent):
     """
     outgoing_edges = 1
 
-    def run(self, documents:List[Document], remove_punc:bool,
+    def run(self, documents:List[Document], remove_punc:bool=False,
             split_by: Literal["sentence", "word"] = 'sentence',
-            split_respect_sentence_boundary = False,
-
+            split_length:int = 2, split_respect_sentence_boundary:bool = False,
+            split_overlap:int = 0):
 
         """ this is required method to invoke the component in
         the pipeline implementation.
@@ -175,7 +175,7 @@ class UdfPreProcessor(BaseComponent):
         Params
         ----------
         documents: documents from the output dictionary returned by Fileconverter
-
+        remove_punc: to remove all Punctuation including ',' and '.' or not
         split_by: document splitting strategy either as word or sentence
         split_length: when synthetically creating the paragrpahs from document,
                       it defines the length of paragraph.
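Note (outside this commit): the run() signature above mirrors the parameters of Haystack's PreProcessor node, which this component appears to wrap. For reference, the equivalent direct configuration under the Haystack 1.x API:

from haystack.nodes import PreProcessor

preprocessor = PreProcessor(
    split_by="sentence",                    # or "word"
    split_length=2,                         # paragraph size, in sentences/words
    split_overlap=0,                        # units shared between neighbours
    split_respect_sentence_boundary=False,  # only meaningful for split_by="word"
)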
utils/sdg_classifier.py
CHANGED
@@ -34,7 +34,7 @@ _lab_dict = {0: 'no_cat',
              17:'SDG 17 - Partnership for the goals',}
 
 @st.cache(allow_output_mutation=True)
-def load_sdgClassifier(config_file = None, classifier_name = None):
+def load_sdgClassifier(config_file:str = None, classifier_name:str = None):
     """
     loads the document classifier using haystack, where the name/path of model
     in HF-hub as string is used to fetch the model object.Either configfile or
@@ -44,8 +44,8 @@ def load_sdgClassifier(config_file = None, classifier_name = None):
 
     Params
     --------
-
-
+    config_file: config file path from which to read the model name
+    classifier_name: if modelname is passed, it takes a priority if not \
     found then will look for configfile, else raise error.
 
 
@@ -69,7 +69,9 @@ def load_sdgClassifier(config_file = None, classifier_name = None):
 
 @st.cache(allow_output_mutation=True)
 def sdg_classification(haystack_doc:List[Document],
-                       threshold:float
+                       threshold:float = 0.8,
+                       classifier_model:TransformersDocumentClassifier= None
+                       )->Tuple[DataFrame,Series]:
     """
     Text-Classification on the list of texts provided. Classifier provides the
     most appropriate label for each text. these labels are in terms of if text
@@ -77,12 +79,13 @@ def sdg_classification(haystack_doc:List[Document],
 
     Params
     ---------
-
+    haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
         contains the list of paragraphs in different format,here the list of
         Haystack Documents is used.
     threshold: threshold value for the model to keep the results from classifier
-    classifiermodel: you can pass the classifier model directly,
-    streamlit
+    classifiermodel: you can pass the classifier model directly,which takes priority
+        however if not then looks for model in streamlit session.
+        In case of streamlit avoid passing the model directly.
 
 
     Returns
@@ -117,7 +120,7 @@ def sdg_classification(haystack_doc:List[Document],
     x = x.rename('count')
     x = x.rename_axis('SDG').reset_index()
     x["SDG"] = pd.to_numeric(x["SDG"])
-    x = x.sort_values(by=['count'])
+    x = x.sort_values(by=['count'], ascending=False)
     x['SDG_name'] = x['SDG'].apply(lambda x: _lab_dict[x])
     x['SDG_Num'] = x['SDG'].apply(lambda x: "SDG "+str(x))
 
@@ -126,11 +129,10 @@ def sdg_classification(haystack_doc:List[Document],
 
     return df, x
 
-def runSDGPreprocessingPipeline(
+def runSDGPreprocessingPipeline(file_name:str, file_path:str,
                     split_by: Literal["sentence", "word"] = 'sentence',
-                    split_respect_sentence_boundary = False,
-
-                    remove_punc = False)->List[Document]:
+                    split_length:int = 2, split_respect_sentence_boundary:bool = False,
+                    split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
     """
     creates the pipeline and runs the preprocessing pipeline,
     the params for pipeline are fetched from paramconfig
@@ -140,13 +142,16 @@ def runSDGPreprocessingPipeline(filePath, fileName,
 
     file_name: filename, in case of streamlit application use
                 st.session_state['filename']
-    file_path: filepath, in case of streamlit application use
-    removePunc: to remove all Punctuation including ',' and '.' or not
+    file_path: filepath, in case of streamlit application use st.session_state['filepath']
     split_by: document splitting strategy either as word or sentence
     split_length: when synthetically creating the paragrpahs from document,
                 it defines the length of paragraph.
     split_respect_sentence_boundary: Used when using 'word' strategy for
                 splititng of text.
+    split_overlap: Number of words or sentences that overlap when creating
+                the paragraphs. This is done as one sentence or 'some words' make sense
+                when read in together with others. Therefore the overlap is used.
+    remove_punc: to remove all Punctuation including ',' and '.' or not
 
 
     Return
@@ -160,9 +165,9 @@ def runSDGPreprocessingPipeline(filePath, fileName,
 
     sdg_processing_pipeline = processingpipeline()
 
-    output_sdg_pre = sdg_processing_pipeline.run(file_paths =
-                            params= {"FileConverter": {"file_path":
-                                        "file_name":
+    output_sdg_pre = sdg_processing_pipeline.run(file_paths = file_path,
+                            params= {"FileConverter": {"file_path": file_path, \
+                                        "file_name": file_name},
                             "UdfPreProcessor": {"remove_punc": remove_punc, \
                             "split_by": split_by, \
                             "split_length":split_length,\
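Note (outside this commit): with the new keyword-style signature, a typical call chain looks like the sketch below; the file names, split values and threshold are placeholders:

from utils.sdg_classifier import runSDGPreprocessingPipeline, sdg_classification

all_documents = runSDGPreprocessingPipeline(file_name="policy.pdf",
                        file_path="docStore/policy.pdf", split_by="word",
                        split_length=80, split_respect_sentence_boundary=True,
                        split_overlap=10, remove_punc=False)

# df: per-paragraph SDG labels kept above the threshold
# x : per-SDG counts, now sorted descending by the sort_values fix above
df, x = sdg_classification(haystack_doc=all_documents['documents'],
                           threshold=0.85)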
utils/semantic_search.py
CHANGED
@@ -1,15 +1,16 @@
-from haystack.nodes import TransformersQueryClassifier
+from haystack.nodes import TransformersQueryClassifier, Docs2Answers
 from haystack.nodes import EmbeddingRetriever, FARMReader
 from haystack.nodes.base import BaseComponent
 from haystack.document_stores import InMemoryDocumentStore
 from markdown import markdown
 from annotated_text import annotation
 from haystack.schema import Document
-from typing import List, Text
+from typing import List, Text, Union
 from typing_extensions import Literal
 from utils.preprocessing import processingpipeline
 from utils.streamlitcheck import check_streamlit
 from haystack.pipelines import Pipeline
+import pandas as pd
 import logging
 try:
     from termcolor import colored
@@ -37,9 +38,13 @@ class QueryCheck(BaseComponent):
     Uses Query Classifier from Haystack, process the query based on query type.
     Ability to determine the statements is not so good, therefore the chances
     statement also get modified. Ex: "List water related issues" will be
-    identified by the model as keywords, and therefore it be processed as "what
-    the 'list all water related issues' related issues and discussions?".
-    but is igonred for now, as semantic search will not
+    identified by the model as keywords, and therefore it be processed as "what
+    are the 'list all water related issues' related issues and discussions?".
+    This is one shortcoming but is igonred for now, as semantic search will not
+    get affected a lot, by this. If you want to pass keywords list and want to
+    do batch processing use. run_batch. Example: if you want to find relevant
+    passages for water, food security, poverty then querylist = ["water", "food
+    security","poverty"] and then execute QueryCheck.run_batch(queries = querylist)
 
     1. https://docs.haystack.deepset.ai/docs/query_classifier
 
@@ -47,11 +52,22 @@ class QueryCheck(BaseComponent):
 
     outgoing_edges = 1
 
-    def run(self, query):
+    def run(self, query:str):
         """
-        mandatory method to use the
+        mandatory method to use the custom node. Determines the query type, if
         if the query is of type keyword/statement will modify it to make it more
         useful for sentence transoformers.
+
+        Params
+        --------
+        query: query/statement/keywords in form of string
+
+        Return
+        ------
+        output: dictionary, with key as identifier and value could be anything
+            we need to return. In this case the output contain key = 'query'.
+
+        output_1: As there is only one outgoing edge, we pass 'output_1' string
 
         """
         query_classifier = loadQueryClassifier()
@@ -61,20 +77,51 @@ class QueryCheck(BaseComponent):
             output = {"query":query,
                       "query_type": 'question/statement'}
         else:
-            output = {"query": "what are the {} related issues and
+            output = {"query": "what are the {} related issues and \
+                      discussions?".format(query),
                       "query_type": 'statements/keyword'}
         logging.info(output)
         return output, "output_1"
 
-    def run_batch(self,
-
+    def run_batch(self, queries:List[str]):
+        """
+        running multiple queries in one go, howeevr need the queries to be passed
+        as list of string. Example: if you want to find relevant passages for
+        water, food security, poverty then querylist = ["water", "food security",
+        "poverty"] and then execute QueryCheck.run_batch(queries = querylist)
+
+        Params
+        --------
+        queries: queries/statements/keywords in form of string encapsulated
+            within List
+
+        Return
+        ------
+        output: dictionary, with key as identifier and value could be anything
+            we need to return. In this case the output contain key = 'queries'.
+
+        output_1: As there is only one outgoing edge, we pass 'output_1' string
+        """
+        query_classifier = loadQueryClassifier()
+        query_list = []
+        for query in queries:
+            result = query_classifier.run(query=query)
+            if result[1] == "output_1":
+                query_list.append(query)
+            else:
+                query_list.append("what are the {} related issues and \
+                                  discussions?".format(query))
+        output = {'queries':query_list}
+        logging.info(output)
+        return output, "output_1"
+
 
 @st.cache(allow_output_mutation=True)
-def runSemanticPreprocessingPipeline(file_path, file_name,
+def runSemanticPreprocessingPipeline(file_path:str, file_name:str,
                     split_by: Literal["sentence", "word"] = 'sentence',
-
-
-                    remove_punc = False)->List[Document]:
+                    split_length:int = 2, split_overlap:int = 0,
+                    split_respect_sentence_boundary:bool = False,
+                    remove_punc:bool = False)->List[Document]:
     """
     creates the pipeline and runs the preprocessing pipeline.
 
@@ -82,22 +129,25 @@ def runSemanticPreprocessingPipeline(file_path, file_name,
     ------------
 
     file_name: filename, in case of streamlit application use
-
+                st.session_state['filename']
     file_path: filepath, in case of streamlit application use
-
-    removePunc: to remove all Punctuation including ',' and '.' or not
+                st.session_state['filepath']
     split_by: document splitting strategy either as word or sentence
     split_length: when synthetically creating the paragrpahs from document,
-
+                it defines the length of paragraph.
+    split_overlap: Number of words or sentences that overlap when creating the
+                paragraphs. This is done as one sentence or 'some words' make sense
+                when read in together with others. Therefore the overlap is used.
     split_respect_sentence_boundary: Used when using 'word' strategy for
-
+                splititng of text.
+    remove_punc: to remove all Punctuation including ',' and '.' or not
 
     Return
     --------------
     List[Document]: When preprocessing pipeline is run, the output dictionary
-
-
-
+        has four objects. For the Haysatck implementation of semantic search we,
+        need to use the List of Haystack Document, which can be fetched by
+        key = 'documents' on output.
 
     """
 
@@ -106,7 +156,7 @@ def runSemanticPreprocessingPipeline(file_path, file_name,
     output_semantic_pre = semantic_processing_pipeline.run(file_paths = file_path,
                             params= {"FileConverter": {"file_path": file_path, \
                                         "file_name": file_name},
-
+                            "UdfPreProcessor": {"remove_punc": remove_punc, \
                                 "split_by": split_by, \
                                 "split_length":split_length,\
                                 "split_overlap": split_overlap,
@@ -115,10 +165,11 @@ def runSemanticPreprocessingPipeline(file_path, file_name,
     return output_semantic_pre
 
 
-@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
-
+@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
+          allow_output_mutation=True)
+def loadRetriever(embedding_model:Text=None, embedding_model_format:Text = None,
                 embedding_layer:int = None, retriever_top_k:int = 10,
-                max_seq_len:int
+                max_seq_len:int=512, document_store:InMemoryDocumentStore=None):
     """
     Returns the Retriever model based on params provided.
     1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
@@ -129,14 +180,16 @@ def loadRetriever(embedding_model:Text = None, embedding_model_format:Text = No
     Params
     ---------
     embedding_model: Name of the model to be used for embedding. Check the links
-
-    embedding_model_format: check the github link of Haystack provided in
-
-
-
-
-
-
+        provided in documentation
+    embedding_model_format: check the github link of Haystack provided in
+        documentation embedding_layer: check the github link of Haystack
+        provided in documentation retriever_top_k: Number of Top results to
+        be returned by
+        retriever max_seq_len: everymodel has max seq len it can handle, check in
+        model card. Needed to hanlde the edge cases.
+    document_store: InMemoryDocumentStore, write haystack Document list to
+        DocumentStore and pass the same to function call. Can be done using
+        createDocumentStore from utils.
 
     Return
     -------
@@ -157,7 +210,8 @@ def loadRetriever(embedding_model:Text = None, embedding_model_format:Text = No
         st.session_state['retriever'] = retriever
     return retriever
 
-@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
+@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
+          allow_output_mutation=True)
 def createDocumentStore(documents:List[Document], similarity:str = 'dot_product',
                         embedding_dim:int = 768):
     """
@@ -167,11 +221,11 @@ def createDocumentStore(documents:List[Document], similarity:str = 'dot_product'
     Params
     -------
     documents: List of haystack document. If using the preprocessing pipeline,
-
+        can be fetched key = 'documents; on output of preprocessing pipeline.
     similarity: scoring function, can be either 'cosine' or 'dot_product'
     embedding_dim: Document store has default value of embedding size = 768, and
-
-
+        update_embeddings method of Docstore cannot infer the embedding size of
+        retiever automatically, therefore set this value as per the model card.
 
     Return
     -------
@@ -185,13 +239,13 @@ def createDocumentStore(documents:List[Document], similarity:str = 'dot_product'
     return document_store
 
 
-@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
+@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
+          allow_output_mutation=True)
 def semanticSearchPipeline(documents:List[Document], embedding_model:Text = None,
-
-
-
-
-                ):
+                embedding_model_format:Text = None,embedding_layer:int = None,
+                embedding_dim:int = 768,retriever_top_k:int = 10,
+                reader_model:str = None, reader_top_k:int = 10,
+                max_seq_len:int =512,useQueryCheck = True, ):
     """
     creates the semantic search pipeline and document Store object from the
     list of haystack documents. The top_k for the Reader and Retirever are kept
@@ -201,6 +255,14 @@ def semanticSearchPipeline(documents:List[Document], embedding_model:Text = Non
     and to some extent extractive QA purpose. The purpose of Reader is strictly to
     highlight the context for retrieved result and not for QA, however as stated
     it can work for QA too in limited sense.
+    There are 4 variants of pipeline it can return
+    1.QueryCheck > Retriever > Reader
+    2.Retriever > Reader
+    3.QueryCheck > Retriever > Docs2Answers : If reader is None,
+    then Doc2answer is used to keep the output of pipeline structurally same.
+    4.Retriever > Docs2Answers
+
+    Links
 
     1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
     2. https://www.sbert.net/examples/applications/semantic-search/README.html
@@ -208,37 +270,39 @@ def semanticSearchPipeline(documents:List[Document], embedding_model:Text = Non
     4. https://docs.haystack.deepset.ai/docs/reader
 
 
-
     Params
     ----------
     documents: list of Haystack Documents, returned by preprocessig pipeline.
     embedding_model: Name of the model to be used for embedding. Check the links
-
-    embedding_model_format: check the github link of Haystack provided in
+        provided in documentation
+    embedding_model_format: check the github link of Haystack provided in
+        documentation
     embedding_layer: check the github link of Haystack provided in documentation
+    embedding_dim: Document store has default value of embedding size = 768, and
+        update_embeddings method of Docstore cannot infer the embedding size of
+        retiever automatically, therefore set this value as per the model card.
     retriever_top_k: Number of Top results to be returned by retriever
     reader_model: Name of the model to be used for Reader node in hasyatck
-
+        Pipeline. Check the links provided in documentation
     reader_top_k: Reader will use retrieved results to further find better matches.
-
-
-    useQueryCheck: Whether to use the querycheck which modifies the query or not.
-    embedding_dim: Document store has default value of embedding size = 768, and
-    update_embeddings method of Docstore cannot infer the embedding size of
-    retiever automaticallu, therefore set this value as per the model card.
+        As purpose here is to use reader to extract context, the value is
+        same as retriever_top_k.
     max_seq_len:everymodel has max seq len it can handle, check in model card.
-
+        Needed to hanlde the edge cases
+    useQueryCheck: Whether to use the querycheck which modifies the query or not.
 
     Return
     ---------
     semanticsearch_pipeline: Haystack Pipeline object, with all the necessary
-
+        nodes [QueryCheck, Retriever, Reader/Docs2Answer]. If reader is None,
+        then Doc2answer is used to keep the output of pipeline structurally
+        same.
 
     document_store: As retriever can work only with Haystack Document Store, the
-
-
-
+        list of document returned by preprocessing pipeline are fed into to
+        get InMemmoryDocumentStore object type, with retriever updating the
+        embeddings of each paragraph in document store.
 
     """
     document_store = createDocumentStore(documents=documents,
@@ -248,34 +312,187 @@ def semanticSearchPipeline(documents:List[Document], embedding_model:Text = Non
                         embedding_layer=embedding_layer,
                         retriever_top_k= retriever_top_k,
                         document_store = document_store,
-                        max_seq_len=max_seq_len)
-
+                        max_seq_len=max_seq_len)
     document_store.update_embeddings(retriever)
-    reader = FARMReader(model_name_or_path=reader_model,
-                        top_k = reader_top_k, use_gpu=True)
     semantic_search_pipeline = Pipeline()
     if useQueryCheck and reader_model:
        querycheck = QueryCheck()
-
-
-       semantic_search_pipeline.add_node(component =
-
+       reader = FARMReader(model_name_or_path=reader_model,
+                           top_k = reader_top_k, use_gpu=True)
+       semantic_search_pipeline.add_node(component = querycheck,
+                           name = "QueryCheck",inputs = ["Query"])
+       semantic_search_pipeline.add_node(component = retriever,
+                           name = "EmbeddingRetriever",inputs = ["QueryCheck.output_1"])
        semantic_search_pipeline.add_node(component = reader, name = "FARMReader",
                            inputs= ["EmbeddingRetriever"])
+
    elif reader_model :
-
-
-       semantic_search_pipeline.add_node(component =
-
+       reader = FARMReader(model_name_or_path=reader_model,
+                           top_k = reader_top_k, use_gpu=True)
+       semantic_search_pipeline.add_node(component = retriever,
+                           name = "EmbeddingRetriever",inputs = ["Query"])
+       semantic_search_pipeline.add_node(component = reader,
+                           name = "FARMReader",inputs= ["EmbeddingRetriever"])
+   elif useQueryCheck and not reader_model:
+       querycheck = QueryCheck()
+       docs2answers = Docs2Answers()
+       semantic_search_pipeline.add_node(component = querycheck,
+                           name = "QueryCheck",inputs = ["Query"])
+       semantic_search_pipeline.add_node(component = retriever,
+                           name = "EmbeddingRetriever",inputs = ["QueryCheck.output_1"])
+       semantic_search_pipeline.add_node(component = docs2answers,
+                           name = "Docs2Answers",inputs= ["EmbeddingRetriever"])
+   elif not useQueryCheck and not reader_model:
+       docs2answers = Docs2Answers()
+       semantic_search_pipeline.add_node(component = retriever,
+                           name = "EmbeddingRetriever",inputs = ["Query"])
+       semantic_search_pipeline.add_node(component = docs2answers,
+                           name = "Docs2Answers",inputs= ["EmbeddingRetriever"])
+
+   logging.info(semantic_search_pipeline.components)
+   return semantic_search_pipeline, document_store
+
+def runSemanticPipeline(pipeline:Pipeline, queries:Union[list,str])->dict:
+    """
+    will use the haystack run or run_batch based on if single query is passed
+    as string or multiple queries as List[str]
+
+    Params
+    -------
+    pipeline: haystack pipeline, this is same as returned by semanticSearchPipeline
+        from utils.semanticsearch
+
+    queries: Either a single query or list of queries.
+
+    Return
+    -------
+    results: Dict containing answers and documents as key and their respective
+        values
+
+    """
+
+    if type(queries) == list:
+        results = pipeline.run_batch(queries=queries)
+    elif type(queries) == str:
+        results = pipeline.run(query=queries)
     else:
-
-
+        logging.info("Please check the input type for the queries")
+        return
 
-
+    return results
 
+def process_query_output(results:dict)->pd.DataFrame:
+    """
+    Returns the dataframe with necessary information like including
+    ['query','answer','answer_offset','context_offset','context','content',
+    'reader_score','retriever_score','id',]. This is designed for output given
+    by semantic search pipeline with single query and final node as reader.
+    The output of pipeline having Docs2Answers as final node or multiple queries
+    need to be handled separately. In these other cases, use process_semantic_output
+    from utils.semantic_search which uses this function internally to make one
+    combined dataframe.
+
+    Params
+    ---------
+    results: this dictionary should have key,values with
+        keys = [query,answers,documents], however answers is optional.
+        in case of [Doc2Answers as final node], process_semantic_output
+        doesnt return answers thereby setting all values contained in
+        answers to 'None'
+
+    Return
+    --------
+    df: dataframe with all the columns mentioned in function description.
+
+    """
+    query_text = results['query']
+    if 'answers' in results.keys():
+        answer_dict = {}
+
+        for answer in results['answers']:
+            answer_dict[answer.document_id] = answer.to_dict()
+    else:
+        answer_dict = {}
+    docs = results['documents']
+    df = pd.DataFrame(columns=['query','answer','answer_offset','context_offset',
+                               'context','content','reader_score','retriever_score',
+                               'id'])
+    for doc in docs:
+        row_list = {}
+        row_list['query'] = query_text
+        row_list['retriever_score'] = doc.score
+        row_list['id'] = doc.id
+        row_list['content'] = doc.content
+        if doc.id in answer_dict.keys():
+            row_list['answer'] = answer_dict[doc.id]['answer']
+            row_list['context'] = answer_dict[doc.id]['context']
+            row_list['reader_score'] = answer_dict[doc.id]['score']
+            answer_offset = answer_dict[doc.id]['offsets_in_document'][0]
+            row_list['answer_offset'] = [answer_offset['start'],answer_offset['end']]
+            start_idx = doc.content.find(row_list['context'])
+            end_idx = start_idx + len(row_list['context'])
+            row_list['context_offset'] = [start_idx, end_idx]
+        else:
+            row_list['answer'] = None
+            row_list['context'] = None
+            row_list['reader_score'] = None
+            row_list['answer_offset'] = None
+            row_list['context_offset'] = None
+        df_dictionary = pd.DataFrame([row_list])
+        df = pd.concat([df, df_dictionary], ignore_index=True)
+
+    return df
+
+def process_semantic_output(results):
+    """
+    Returns the dataframe with necessary information like including
+    ['query','answer','answer_offset','context_offset','context','content',
+    'reader_score','retriever_score','id',]. Distingushes if its single query or
+    multi queries by reading the pipeline output dictionary keys.
+    Uses the process_query_output to get the dataframe for each query and create
+    one concataneted dataframe. In case f Docs2Answers as final node, deletes
+    the answers part. See documentations of process_query_output.
+
+    Params
+    ---------
+    results: raw output of runSemanticPipeline.
+
+    Return
+    --------
+    df: dataframe with all the columns mentioned in function description.
+
+    """
+    output = {}
+    if 'query' in results.keys():
+        output['query'] = results['query']
+        output['documents'] = results['documents']
+        if results['node_id'] == 'Docs2Answers':
+            pass
+        else:
+            output['answers'] = results['answers']
+        df = process_query_output(output)
+        return df
+    if 'queries' in results.keys():
+        df = pd.DataFrame(columns=['query','answer','answer_offset',
+                                   'context_offset','context','content',
+                                   'reader_score','retriever_score','id'])
+        for query,answers,documents in zip(results['queries'],
+                                results['answers'],results['documents']):
+            output = {}
+            output['query'] = query
+            output['documents'] = documents
+            if results['node_id'] == 'Docs2Answers':
+                pass
+            else:
+                output['answers'] = answers
+
+            temp = process_query_output(output)
+            df = pd.concat([df, temp], ignore_index=True)
+
+
+    return df
 
-def semanticsearchAnnotator(matches:
+def semanticsearchAnnotator(matches:List[List[int]], document:Text):
     """
     Annotates the text in the document defined by list of [start index, end index]
     Example: "How are you today", if document type is text, matches = [[0,3]]
@@ -311,12 +528,14 @@ def semanticsearchAnnotator(matches: List[List[int]], document):
     print(annotated_text)
 
 
-def semantic_keywordsearch(query:Text,documents:List[Document],
+def semantic_keywordsearch(query:Text,documents:List[Document],
+                    embedding_model:Text,
                     embedding_model_format:Text,
-
-
-
-
+                    embedding_layer:int, reader_model:str,
+                    retriever_top_k:int = 10, reader_top_k:int = 10,
+                    return_results:bool = False, embedding_dim:int = 768,
+                    max_seq_len:int = 512,
+                    sort_by:Literal["retriever", "reader"] = 'retriever'):
     """
     Performs the Semantic search on the List of haystack documents which is
     returned by preprocessing Pipeline.
@@ -327,7 +546,7 @@ def semantic_keywordsearch(query:Text,documents:List[Document],embedding_model:T
     documents: List fo Haystack documents returned by preprocessing pipeline.
 
     """
-    semanticsearch_pipeline, doc_store = semanticSearchPipeline(documents,
+    semanticsearch_pipeline, doc_store = semanticSearchPipeline(documents = documents,
                     embedding_model= embedding_model,
                     embedding_layer= embedding_layer,
                     embedding_model_format= embedding_model_format,
@@ -335,22 +554,24 @@ def semantic_keywordsearch(query:Text,documents:List[Document],embedding_model:T
                     reader_top_k= reader_top_k, embedding_dim=embedding_dim,
                     max_seq_len=max_seq_len)
 
-
+    raw_output = runSemanticPipeline(semanticsearch_pipeline,query)
+    results_df = process_semantic_output(raw_output)
+    if sort_by == 'retriever':
+        results_df = results_df.sort_values(by=['retriever_score'], ascending=False)
+    else:
+        results_df = results_df.sort_values(by=['reader_score'], ascending=False)
+
     if return_results:
-        return
+        return results_df
     else:
         if check_streamlit:
             st.markdown("##### Top few semantic search results #####")
         else:
            print("Top few semantic search results")
-        for i
-            temp = answer.to_dict()
-            doc = doc_store.get_document_by_id(temp['document_id']).content
-            start_idx = doc.find(temp['context'])
-            end_idx = start_idx + len(temp['context'])
-            match = [[start_idx,end_idx]]
+        for i in range(len(results_df)):
            if check_streamlit:
               st.write("Result {}".format(i+1))
            else:
              print("Result {}".format(i+1))
-           semanticsearchAnnotator(
+           semanticsearchAnnotator(results_df.loc[i]['context_offset'],
+                                   results_df.loc[i]['content'] )
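Note (outside this commit): end to end, the new helpers chain as sketched below; the embedding model name and file paths are illustrative placeholders, and reader_model=None exercises the Docs2Answers fallback described in the docstring:

from utils.semantic_search import (runSemanticPreprocessingPipeline,
        semanticSearchPipeline, runSemanticPipeline, process_semantic_output)

output = runSemanticPreprocessingPipeline(file_path="docStore/policy.pdf",
                                          file_name="policy.pdf")

# reader_model=None -> variant 3: QueryCheck > Retriever > Docs2Answers
pipeline, doc_store = semanticSearchPipeline(documents=output['documents'],
        embedding_model="sentence-transformers/msmarco-distilbert-cos-v5",
        embedding_model_format="sentence_transformers",
        embedding_layer=None, reader_model=None)

# a str runs pipeline.run(); a List[str] runs pipeline.run_batch()
raw = runSemanticPipeline(pipeline, ["water", "food security"])
results_df = process_semantic_output(raw)  # one combined DataFrame for both queries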
|