prashant committed
Commit • 63da636 • 1 Parent(s): 40cb026
update semantic search

Files changed:
- utils/{search.py → lexical_search.py} +18 -146
- utils/semantic_search.py +172 -0
utils/{search.py → lexical_search.py}
RENAMED
@@ -1,5 +1,4 @@
-from haystack.nodes import TfidfRetriever
-from haystack.nodes import EmbeddingRetriever, FARMReader
+from haystack.nodes import TfidfRetriever
 from haystack.nodes.base import BaseComponent
 from haystack.document_stores import InMemoryDocumentStore
 import configparser
@@ -101,7 +100,7 @@ def runRegexMatcher(token_list:List[str], document:Text):
 
     return matches, document
 
-def searchAnnotator(matches: List[List[int]], document):
+def lexicalsearchAnnotator(matches: List[List[int]], document):
     """
     Annotates the text in the document defined by list of [start index, end index]
     Example: "How are you today", if document type is text, matches = [[0,3]]
@@ -127,27 +126,6 @@ def searchAnnotator(matches: List[List[int]], document):
         unsafe_allow_html=True,
     )
 
-def lexical_search(query:Text,documents:List[Document]):
-    """
-    Performs the Lexical search on the List of haystack documents which is
-    returned by preprocessing Pipeline.
-    """
-
-    document_store = InMemoryDocumentStore()
-    document_store.write_documents(documents)
-
-    # Haystack Retriever works with document stores only.
-    retriever = TfidfRetriever(document_store)
-    results = retriever.retrieve(query=query,
-                                 top_k= int(config.get('lexical_search','TOP_K')))
-    query_tokens = tokenize_lexical_query(query)
-    for count, result in enumerate(results):
-        # if result.content != "":
-        matches, doc = runSpacyMatcher(query_tokens,result.content)
-        if len(matches) != 0:
-            st.write("Result {}".format(count+1))
-            searchAnnotator(matches, doc)
-
 def runLexicalPreprocessingPipeline()->List[Document]:
     """
     creates the pipeline and runs the preprocessing pipeline,
@@ -177,131 +155,25 @@ def runLexicalPreprocessingPipeline()->List[Document]:
                                            "split_overlap": split_overlap}})
 
     return output_lexical_pre['documents']
-
-def runSemanticPreprocessingPipeline()->List[Document]:
-    """
-    creates the pipeline and runs the preprocessing pipeline,
-    the params for pipeline are fetched from paramconfig
-
-    Return
-    --------------
-    List[Document]: When preprocessing pipeline is run, the output dictionary
-    has four objects. For the Haysatck implementation of semantic search we,
-    need to use the List of Haystack Document, which can be fetched by
-    key = 'documents' on output.
-
-    """
-    file_path = st.session_state['filepath']
-    file_name = st.session_state['filename']
-    semantic_processing_pipeline = processingpipeline()
-    split_by = config.get('semantic_search','SPLIT_BY')
-    split_length = int(config.get('semantic_search','SPLIT_LENGTH'))
-    split_overlap = int(config.get('semantic_search','SPLIT_OVERLAP'))
-
-    output_semantic_pre = semantic_processing_pipeline.run(file_paths = file_path,
-                            params= {"FileConverter": {"file_path": file_path, \
-                                        "file_name": file_name},
-                                     "UdfPreProcessor": {"removePunc": False, \
-                                            "split_by": split_by, \
-                                            "split_length":split_length,\
-                                            "split_overlap": split_overlap}})
-
-
-
-class QueryCheck(BaseComponent):
-
-    outgoing_edges = 1
-
-    def run(self, query):
-
-        query_classifier = TransformersQueryClassifier(model_name_or_path=
-                            "shahrukhx01/bert-mini-finetune-question-detection")
-
-
-        result = query_classifier.run(query=query)
-
-        if result[1] == "output_1":
-            output = {"query":query,
-                      "query_type": 'question/statement'}
-        else:
-            output = {"query": "find all issues related to {}".format(query),
-                      "query_type": 'statements/keyword'}
-
-        return output, "output_1"
-
-    def run_batch(self, query):
-        pass
-
-
-def semanticSearchPipeline(documents, show_answers = False):
-    document_store = InMemoryDocumentStore()
-    document_store.write_documents(documents)
-
-    embedding_model = config.get('semantic_search','RETRIEVER')
-    embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
-    embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
-    retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
-
-
-
-    querycheck = QueryCheck()
-    retriever = EmbeddingRetriever(
-        document_store=document_store,
-        embedding_model=embedding_model,top_k = retriever_top_k,
-        emb_extraction_layer=embedding_layer, scale_score =True,
-        model_format=embedding_model_format, use_gpu = True)
-    document_store.update_embeddings(retriever)
-
-
-    semanticsearch_pipeline = Pipeline()
-    semanticsearch_pipeline.add_node(component = querycheck, name = "QueryCheck",
-                                     inputs = ["Query"])
-    semanticsearch_pipeline.add_node(component = retriever, name = "EmbeddingRetriever",
-                                     inputs = ["QueryCheck.output_1"])
-    if show_answers == True:
-        reader_model = config.get('semantic_search','READER')
-        reader_top_k = retriever_top_k
-        reader = FARMReader(model_name_or_path=reader_model,
-                            top_k = reader_top_k, use_gpu=True)
-
-        semanticsearch_pipeline.add_node(component = reader, name = "FARMReader",
-                                         inputs= ["EmbeddingRetriever"])
-
-    return semanticsearch_pipeline, document_store
-
-def semantic_search(query:Text,documents:List[Document],show_answers = False):
+
+def lexical_search(query:Text,documents:List[Document]):
     """
     Performs the Lexical search on the List of haystack documents which is
     returned by preprocessing Pipeline.
     """
-    threshold = 0.4
-    semanticsearch_pipeline, doc_store = semanticSearchPipeline(documents,
-                                    show_answers=show_answers)
-    results = semanticsearch_pipeline.run(query = query)
-
-
-    if show_answers == False:
-        results = results['documents']
-        for i,queryhit in enumerate(results):
-
-            if queryhit.score > threshold:
-                st.write("\t {}: \t {}".format(i+1, queryhit.content.replace("\n", " ")))
-                st.markdown("---")
-
-    else:
-
-        for answer in results['answers']:
-            st.write(answer)
-            matches = []
-            doc = []
-            if answer.score >0.01:
-                temp = answer.to_dict()
-                start_idx = temp['offsets_in_document'][0]['start']
-                end_idx = temp['offsets_in_document'][0]['end']
-
-                matches.append([start_idx,end_idx])
-                doc.append(doc_store.get_document_by_id(temp['document_id']).content)
-                searchAnnotator(matches,doc)
-
-
+
+    document_store = InMemoryDocumentStore()
+    document_store.write_documents(documents)
+
+    # Haystack Retriever works with document stores only.
+    retriever = TfidfRetriever(document_store)
+    results = retriever.retrieve(query=query,
+                                 top_k= int(config.get('lexical_search','TOP_K')))
+    query_tokens = tokenize_lexical_query(query)
+    for count, result in enumerate(results):
+        # if result.content != "":
+        matches, doc = runSpacyMatcher(query_tokens,result.content)
+        if len(matches) != 0:
+            st.write("Result {}".format(count+1))
+            lexicalsearchAnnotator(matches, doc)
+
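For reference, a minimal sketch of how the renamed lexical module is driven end to end from the Streamlit app. It assumes runLexicalPreprocessingPipeline() reads the same st.session_state keys as its semantic counterpart below (only its signature and return are visible in this hunk); the file path and query are illustrative, not part of this commit.

# Sketch: driving utils/lexical_search.py inside the Streamlit app
# (run via `streamlit run app.py`). The session-state keys and the
# sample file are assumptions for illustration only.
import streamlit as st
from utils.lexical_search import runLexicalPreprocessingPipeline, lexical_search

st.session_state['filepath'] = 'docs/sample_report.pdf'  # hypothetical upload
st.session_state['filename'] = 'sample_report.pdf'

# Convert and split the file into haystack Documents, then run TF-IDF
# retrieval and render the spaCy-matched spans via lexicalsearchAnnotator.
paragraphs = runLexicalPreprocessingPipeline()
lexical_search(query='climate adaptation measures', documents=paragraphs)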
utils/semantic_search.py
ADDED
@@ -0,0 +1,172 @@
+from haystack.nodes import TransformersQueryClassifier
+from haystack.nodes import EmbeddingRetriever, FARMReader
+from haystack.nodes.base import BaseComponent
+from haystack.document_stores import InMemoryDocumentStore
+import configparser
+import streamlit as st
+from markdown import markdown
+from annotated_text import annotation
+from haystack.schema import Document
+from typing import List, Text
+from utils.preprocessing import processingpipeline
+from haystack.pipelines import Pipeline
+
+config = configparser.ConfigParser()
+config.read_file(open('paramconfig.cfg'))
+
+class QueryCheck(BaseComponent):
+
+    outgoing_edges = 1
+
+    def run(self, query):
+
+        query_classifier = TransformersQueryClassifier(model_name_or_path=
+                            "shahrukhx01/bert-mini-finetune-question-detection")
+
+
+        result = query_classifier.run(query=query)
+
+        if result[1] == "output_1":
+            output = {"query":query,
+                      "query_type": 'question/statement'}
+        else:
+            output = {"query": "find all issues related to {}".format(query),
+                      "query_type": 'statements/keyword'}
+
+        return output, "output_1"
+
+    def run_batch(self, query):
+        pass
+
+def runSemanticPreprocessingPipeline()->List[Document]:
+    """
+    creates the pipeline and runs the preprocessing pipeline,
+    the params for pipeline are fetched from paramconfig
+
+    Return
+    --------------
+    List[Document]: When preprocessing pipeline is run, the output dictionary
+    has four objects. For the Haystack implementation of semantic search we
+    need to use the List of Haystack Document, which can be fetched by
+    key = 'documents' on output.
+
+    """
+    file_path = st.session_state['filepath']
+    file_name = st.session_state['filename']
+    semantic_processing_pipeline = processingpipeline()
+    split_by = config.get('semantic_search','SPLIT_BY')
+    split_length = int(config.get('semantic_search','SPLIT_LENGTH'))
+    split_overlap = int(config.get('semantic_search','SPLIT_OVERLAP'))
+
+    output_semantic_pre = semantic_processing_pipeline.run(file_paths = file_path,
+                            params= {"FileConverter": {"file_path": file_path, \
+                                        "file_name": file_name},
+                                     "UdfPreProcessor": {"removePunc": False, \
+                                            "split_by": split_by, \
+                                            "split_length":split_length,\
+                                            "split_overlap": split_overlap}})
+
+    return output_semantic_pre['documents']
+
+
+def semanticSearchPipeline(documents, show_answers = False):
+    document_store = InMemoryDocumentStore()
+    document_store.write_documents(documents)
+
+    embedding_model = config.get('semantic_search','RETRIEVER')
+    embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
+    embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
+    retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
+
+
+
+    querycheck = QueryCheck()
+    retriever = EmbeddingRetriever(
+        document_store=document_store,
+        embedding_model=embedding_model,top_k = retriever_top_k,
+        emb_extraction_layer=embedding_layer, scale_score =True,
+        model_format=embedding_model_format, use_gpu = True)
+    document_store.update_embeddings(retriever)
+
+
+    semanticsearch_pipeline = Pipeline()
+    semanticsearch_pipeline.add_node(component = querycheck, name = "QueryCheck",
+                                     inputs = ["Query"])
+    semanticsearch_pipeline.add_node(component = retriever, name = "EmbeddingRetriever",
+                                     inputs = ["QueryCheck.output_1"])
+    if show_answers == True:
+        reader_model = config.get('semantic_search','READER')
+        reader_top_k = retriever_top_k
+        reader = FARMReader(model_name_or_path=reader_model,
+                            top_k = reader_top_k, use_gpu=True)
+
+        semanticsearch_pipeline.add_node(component = reader, name = "FARMReader",
+                                         inputs= ["EmbeddingRetriever"])
+
+    return semanticsearch_pipeline, document_store
+
+def semanticsearchAnnotator(matches: List[List[int]], document):
+    """
+    Annotates the text in the document defined by list of [start index, end index]
+    Example: "How are you today", if document type is text, matches = [[0,3]]
+    will give answer = "How", however in case we used the spacy matcher then the
+    matches = [[0,3]] will give answer = "How are you". However if spacy is used
+    to find "How" then the matches = [[0,1]] for the string defined above.
+
+    """
+    start = 0
+    annotated_text = ""
+    for match in matches:
+        start_idx = match[0]
+        end_idx = match[1]
+        # document is a plain string here (Document.content), so slice directly
+        annotated_text = (annotated_text + document[start:start_idx]
+                          + str(annotation(body=document[start_idx:end_idx],
+                          label="ANSWER", background="#964448", color='#ffffff')))
+        start = end_idx
+
+    annotated_text = annotated_text + document[end_idx:]
+
+    st.write(
+        markdown(annotated_text),
+        unsafe_allow_html=True,
+    )
+
+
+def semantic_search(query:Text,documents:List[Document],show_answers = False):
+    """
+    Performs the semantic search on the List of haystack documents which is
+    returned by preprocessing Pipeline.
+    """
+    threshold = 0.4
+    semanticsearch_pipeline, doc_store = semanticSearchPipeline(documents,
+                                    show_answers=show_answers)
+    results = semanticsearch_pipeline.run(query = query)
+
+
+    if show_answers == False:
+        results = results['documents']
+        for i,queryhit in enumerate(results):
+
+            if queryhit.score > threshold:
+                st.write("\t {}: \t {}".format(i+1, queryhit.content.replace("\n", " ")))
+                st.markdown("---")
+
+    else:
+
+        for answer in results['answers']:
+            st.write(answer)
+            # matches = []
+            # doc = []
+            if answer.score >0.01:
+                temp = answer.to_dict()
+                start_idx = temp['offsets_in_document'][0]['start']
+                end_idx = temp['offsets_in_document'][0]['end']
+
+                # matches.append([start_idx,end_idx])
+                # doc.append(doc_store.get_document_by_id(temp['document_id']).content)
+                match = [[start_idx,end_idx]]
+                doc = doc_store.get_document_by_id(temp['document_id']).content
+                semanticsearchAnnotator(match,doc)
+
+
+
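Both modules read paramconfig.cfg at import time via config.read_file(open('paramconfig.cfg')), so that file must exist with every referenced key. The sketch below generates such a config: the section and key names come from the code in this commit, while all values, the lexical split keys, and the two model names are illustrative assumptions.

# Sketch: generate a paramconfig.cfg covering the keys this commit reads.
# Values are placeholders, not the project's real settings; the SPLIT_* keys
# under [lexical_search] are presumed to be read by runLexicalPreprocessingPipeline.
import configparser

config = configparser.ConfigParser()
config['lexical_search'] = {
    'TOP_K': '10',            # TfidfRetriever hits shown by lexical_search
    'SPLIT_BY': 'paragraph',
    'SPLIT_LENGTH': '1',
    'SPLIT_OVERLAP': '0',
}
config['semantic_search'] = {
    'SPLIT_BY': 'sentence',
    'SPLIT_LENGTH': '3',
    'SPLIT_OVERLAP': '1',
    'RETRIEVER': 'sentence-transformers/all-MiniLM-L6-v2',   # assumed model
    'RETRIEVER_FORMAT': 'sentence_transformers',
    'RETRIEVER_EMB_LAYER': '-1',
    'RETRIEVER_TOP_K': '10',
    'READER': 'deepset/roberta-base-squad2',                 # assumed model
}
with open('paramconfig.cfg', 'w') as f:
    config.write(f)

With that in place, the semantic path mirrors the lexical one (inside the Streamlit app, after the upload step has populated session state):

from utils.semantic_search import runSemanticPreprocessingPipeline, semantic_search

docs = runSemanticPreprocessingPipeline()                       # List[Document]
semantic_search('mitigation targets', docs, show_answers=True)  # QA answers + highlights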