Spaces: GIZ

Commit 43cd965 by prashant (1 parent: 3a88079)

search UI changes

appStore/keyword_search.py CHANGED

@@ -14,7 +14,8 @@ config = getconfig('paramconfig.cfg')
 split_by = config.get('semantic_search','SPLIT_BY')
 split_length = int(config.get('semantic_search','SPLIT_LENGTH'))
 split_overlap = int(config.get('semantic_search','SPLIT_OVERLAP'))
-split_respect_sentence_boundary = bool(int(config.get('semantic_search','RESPECT_SENTENCE_BOUNDARY')))
+split_respect_sentence_boundary = bool(int(config.get('semantic_search',
+                                        'RESPECT_SENTENCE_BOUNDARY')))
 remove_punc = bool(int(config.get('semantic_search','REMOVE_PUNC')))
 embedding_model = config.get('semantic_search','RETRIEVER')
 embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')

@@ -22,6 +23,11 @@ embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
 retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
 reader_model = config.get('semantic_search','READER')
 reader_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
+lexical_split_by = config.get('lexical_search','SPLIT_BY')
+lexical_split_length = int(config.get('lexical_search','SPLIT_LENGTH'))
+lexical_split_overlap = int(config.get('lexical_search','SPLIT_OVERLAP'))
+lexical_remove_punc = bool(int(config.get('lexical_search','REMOVE_PUNC')))
+lexical_top_k = int(config.get('lexical_search','TOP_K'))

 def app():

@@ -49,22 +55,23 @@ def app():
         keywordexample = json.load(json_file)

     genre = st.radio("Select Keyword Category", list(keywordexample.keys()))
-    if genre == 'Food':
-        keywordList = keywordexample['Food']
-    elif genre == 'Climate':
-        keywordList = keywordexample['Climate']
-    elif genre == 'Social':
-        keywordList = keywordexample['Social']
-    elif genre == 'Nature':
-        keywordList = keywordexample['Nature']
-    elif genre == 'Implementation':
-        keywordList = keywordexample['Implementation']
+    if genre:
+        keywordList = keywordexample[genre]
+    # elif genre == 'Climate':
+    #     keywordList = keywordexample['Climate']
+    # elif genre == 'Social':
+    #     keywordList = keywordexample['Social']
+    # elif genre == 'Nature':
+    #     keywordList = keywordexample['Nature']
+    # elif genre == 'Implementation':
+    #     keywordList = keywordexample['Implementation']
     else:
         keywordList = None

-    searchtype = st.selectbox("Do you want to find exact macthes or similar \
-                              meaning/context",
-                              ['Exact Matches', 'Similar context/meaning'])
+    # searchtype = st.selectbox("Do you want to find exact macthes or similar \
+    #                           meaning/context",
+    #                           ['Exact Matches', 'Similar context/meaning'])

     st.markdown("---")

@@ -80,7 +87,7 @@ def app():
                            for and we will look for similar \
                            context in the document.",
                            placeholder="Enter keyword here")
-
+    searchtype = st.checkbox("Show only Exact Matches")
    if st.button("Find them"):

        if queryList == "":

@@ -91,16 +98,22 @@
            if 'filepath' in st.session_state:


-                if searchtype == 'Exact Matches':
-                    # allDocuments = runLexicalPreprocessingPipeline(
-                    #                     st.session_state['filepath'],
-                    #                     st.session_state['filename'])
-                    # logging.info("performing lexical search")
-                    # with st.spinner("Performing Exact matching search \
-                    #                 (Lexical search) for you"):
-                    #     st.markdown("##### Top few lexical search (TFIDF) hits #####")
-                    #     lexical_search(queryList,allDocuments['documents'])
-                    pass
+                if searchtype:
+                    allDocuments = runLexicalPreprocessingPipeline(
+                        file_name=st.session_state['filename'],
+                        file_path=st.session_state['filepath'],
+                        split_by=lexical_split_by,
+                        split_length=lexical_split_length,
+                        split_overlap=lexical_split_overlap,
+                        removePunc=lexical_remove_punc)
+                    logging.info("performing lexical search")
+                    with st.spinner("Performing Exact matching search \
+                                    (Lexical search) for you"):
+                        st.markdown("##### Top few lexical search (TFIDF) hits #####")
+                        lexical_search(
+                            query=queryList,
+                            documents=allDocuments['documents'],
+                            top_k=lexical_top_k)
                else:
                    allDocuments = runSemanticPreprocessingPipeline(
                        file_path= st.session_state['filepath'],

@@ -109,7 +122,7 @@
                        split_length= split_length,
                        split_overlap=split_overlap,
                        removePunc= remove_punc,
-                        split_respect_sentence_boundary=split_respect_sentence_boundary)
+                        split_respect_sentence_boundary=split_respect_sentence_boundary)


                    logging.info("starting semantic search")

@@ -120,7 +133,6 @@
                        embedding_layer=embedding_layer,
                        embedding_model_format=embedding_model_format,
                        reader_model=reader_model,reader_top_k=reader_top_k,
-
                        retriever_top_k=retriever_top_k)

            else:
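
The net effect in appStore/keyword_search.py is that the old selectbox plus if/elif keyword chain collapses into a dictionary lookup and a single boolean branch: the "Show only Exact Matches" checkbox routes the query to the TF-IDF (lexical) path, otherwise the semantic pipeline runs. A minimal, self-contained sketch of that dispatch follows; the stub functions and the dispatch() helper are illustrative stand-ins for the real pipeline calls, not code from the repo.

# Sketch of the simplified dispatch in app(): one checkbox value decides
# between lexical (TF-IDF) and semantic search. The two run_* stubs stand in
# for runLexicalPreprocessingPipeline + lexical_search and
# runSemanticPreprocessingPipeline + semantic_search.
from typing import List


def run_lexical(query: str, top_k: int) -> List[str]:
    return [f"lexical (TFIDF) hit {i + 1} for '{query}'" for i in range(top_k)]


def run_semantic(query: str, top_k: int) -> List[str]:
    return [f"semantic hit {i + 1} for '{query}'" for i in range(top_k)]


def dispatch(query: str, exact_matches_only: bool, top_k: int = 3) -> List[str]:
    # mirrors `if searchtype: ... else: ...`, where searchtype is the value of
    # st.checkbox("Show only Exact Matches")
    if exact_matches_only:
        return run_lexical(query, top_k)
    return run_semantic(query, top_k)


if __name__ == "__main__":
    print(dispatch("climate adaptation", exact_matches_only=True))
    print(dispatch("climate adaptation", exact_matches_only=False))
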
docStore/sample/keywordexample.json CHANGED

@@ -1,4 +1,4 @@
-{"I will enter my own keyword":[],
+{
  "Food":"Food security,Nutrition,Diets,Food loss",
  "Climate":"Climate,Adaptation,Mitigation,Decarbonization,Carbon neutrality,Net zero Emissions",
  "Social":"Indigenous,Local community(ies),Gender,Rural livelihoods,Minority",
paramconfig.cfg CHANGED

@@ -1,8 +1,9 @@
 [lexical_search]
 TOP_K = 20
-SPLIT_BY = sentence
-SPLIT_LENGTH = 3
+SPLIT_BY = word
+SPLIT_LENGTH = 120
 SPLIT_OVERLAP = 0
+REMOVE_PUNC = 0

 [semantic_search]
 RETRIEVER_TOP_K = 10
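
These INI values are consumed in appStore/keyword_search.py via the config.get(...) calls shown above: strings are used as-is, counts go through int(), and 0/1 flags go through bool(int(...)). A small standalone sketch of that parsing pattern, using an inline string instead of the paramconfig.cfg file so it runs on its own:

# How the app reads these settings: plain configparser, numeric values cast
# with int(), 0/1 flags cast with bool(int(...)). Reads from an inline string
# here instead of paramconfig.cfg so the snippet is self-contained.
import configparser

cfg_text = """
[lexical_search]
TOP_K = 20
SPLIT_BY = word
SPLIT_LENGTH = 120
SPLIT_OVERLAP = 0
REMOVE_PUNC = 0
"""

config = configparser.ConfigParser()
config.read_string(cfg_text)

lexical_split_by = config.get('lexical_search', 'SPLIT_BY')                   # 'word'
lexical_split_length = int(config.get('lexical_search', 'SPLIT_LENGTH'))      # 120
lexical_remove_punc = bool(int(config.get('lexical_search', 'REMOVE_PUNC')))  # False
lexical_top_k = int(config.get('lexical_search', 'TOP_K'))                    # 20

print(lexical_split_by, lexical_split_length, lexical_remove_punc, lexical_top_k)
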
utils/lexical_search.py CHANGED

@@ -8,9 +8,9 @@ from markdown import markdown
 from annotated_text import annotation
 from haystack.schema import Document
 from typing import List, Text
+from typing_extensions import Literal
 from utils.preprocessing import processingpipeline
 from utils.streamlitcheck import check_streamlit
-import configparser
 import logging
 try:
     from termcolor import colored

@@ -21,18 +21,17 @@ try:
     import streamlit as st
 except ImportError:
     logging.info("Streamlit not installed")
-config = configparser.ConfigParser()
-try:
-    config.read_file(open('paramconfig.cfg'))
-except Exception:
-    logging.warning("paramconfig file not found")
-    st.info("Please place the paramconfig file in the same directory as app.py")


-def runLexicalPreprocessingPipeline(file_path, file_name)->List[Document]:
+def runLexicalPreprocessingPipeline(file_path, file_name,
+                        split_by: Literal["sentence", "word"] = 'word',
+                        split_length:int = 80, removePunc:bool = False,
+                        split_overlap:int = 0)->List[Document]:
     """
     creates the pipeline and runs the preprocessing pipeline,
-    the params for pipeline are fetched from paramconfig
+    the params for pipeline are fetched from paramconfig. As lexical search
+    is not affected by overlap, split_overlap = 0 and split_by = word in the
+    default paramconfig.

     Params
     ------------

@@ -41,6 +40,11 @@ def runLexicalPreprocessingPipeline(file_path, file_name)->List[Document]:
         st.session_state['filename']
     file_path: filepath, in case of streamlit application use
         st.session_state['filepath']
+    removePunc: to remove all Punctuation including ',' and '.' or not
+    split_by: document splitting strategy, either word or sentence
+    split_length: when synthetically creating the paragraphs from the document,
+        it defines the length of paragraph.

     Return
     --------------

@@ -52,14 +56,12 @@ def runLexicalPreprocessingPipeline(file_path, file_name)->List[Document]:
     """

     lexical_processing_pipeline = processingpipeline()
-    split_by = config.get('lexical_search','SPLIT_BY')
-    split_length = int(config.get('lexical_search','SPLIT_LENGTH'))
-    split_overlap = int(config.get('lexical_search','SPLIT_OVERLAP'))

     output_lexical_pre = lexical_processing_pipeline.run(file_paths = file_path,
                             params= {"FileConverter": {"file_path": file_path, \
                                         "file_name": file_name},
-                            "UdfPreProcessor": {"removePunc": False, \
+                            "UdfPreProcessor": {"removePunc": removePunc, \
                                         "split_by": split_by, \
                                         "split_length":split_length,\
                                         "split_overlap": split_overlap}})

@@ -201,7 +203,7 @@ def spacyAnnotator(matches: List[List[int]], document:spacy.tokens.doc.Doc):
     else:
         print(annotated_text)

-def lexical_search(query:Text, documents:List[Document]):
+def lexical_search(query:Text, top_k:int, documents:List[Document]):
     """
     Performs the Lexical search on the List of haystack documents which is
     returned by preprocessing Pipeline.

@@ -210,6 +212,7 @@ def lexical_search(query:Text,documents:List[Document]):
     -------
     query: Keywords that need to be searched in documents.
     documents: List of Haystack documents returned by preprocessing pipeline.
+    top_k: Number of top results to be fetched.

     """

@@ -218,8 +221,7 @@ def lexical_search(query:Text,documents:List[Document]):

     # Haystack Retriever works with document stores only.
     retriever = TfidfRetriever(document_store)
-    results = retriever.retrieve(query=query,
-                top_k= int(config.get('lexical_search','TOP_K')))
+    results = retriever.retrieve(query=query, top_k=top_k)
     query_tokens = tokenize_lexical_query(query)
     for count, result in enumerate(results):
         matches, doc = runSpacyMatcher(query_tokens,result.content)
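
For reference, the retrieval step that lexical_search() now performs with an explicit top_k looks roughly like the following. This is a hedged sketch against the farm-haystack 1.x API already used in this file (TfidfRetriever over an in-memory store); the sample documents are invented.

# Hedged sketch of the TF-IDF retrieval path with a caller-supplied top_k.
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TfidfRetriever
from haystack.schema import Document

docs = [
    Document(content="Food security and nutrition targets for 2030."),
    Document(content="Climate adaptation and mitigation measures."),
    Document(content="Gender equality and rural livelihoods."),
]

document_store = InMemoryDocumentStore()
document_store.write_documents(docs)

# Haystack Retriever works with document stores only (see lexical_search()).
retriever = TfidfRetriever(document_store=document_store)
results = retriever.retrieve(query="climate adaptation", top_k=2)

for rank, result in enumerate(results, start=1):
    print(rank, result.content)
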
utils/preprocessing.py CHANGED

@@ -167,7 +167,7 @@ class UdfPreProcessor(BaseComponent):
     def run(self, documents:List[Document], removePunc:bool,
             split_by: Literal["sentence", "word"] = 'sentence',
             split_respect_sentence_boundary = False,
-            split_length:int = 2, split_overlap = 0):
+            split_length:int = 2, split_overlap:int = 0):

         """ this is required method to invoke the component in
         the pipeline implementation.

@@ -181,6 +181,9 @@ class UdfPreProcessor(BaseComponent):
             it defines the length of paragraph.
         split_respect_sentence_boundary: Used when using 'word' strategy for
             splitting of text.
+        split_overlap: Number of words or sentences that overlap when creating
+            the paragraphs. This is done as one sentence or 'some words' make
+            sense when read together with others. Therefore the overlap is used.

         Return
         ---------
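
To make the new split_overlap documentation concrete: with the 'word' strategy, consecutive paragraphs share the last split_overlap words. The real splitting happens inside the Haystack preprocessor wrapped by UdfPreProcessor; the toy helper below only illustrates the arithmetic.

# Pure-Python illustration of split_length / split_overlap for the 'word'
# strategy; not the actual implementation used by UdfPreProcessor.
from typing import List


def split_words(text: str, split_length: int, split_overlap: int) -> List[str]:
    words = text.split()
    step = split_length - split_overlap
    return [" ".join(words[i:i + split_length]) for i in range(0, len(words), step)]


text = "one two three four five six seven eight"
print(split_words(text, split_length=4, split_overlap=0))
# ['one two three four', 'five six seven eight']
print(split_words(text, split_length=4, split_overlap=1))
# ['one two three four', 'four five six seven', 'seven eight']
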
utils/semantic_search.py CHANGED

@@ -34,7 +34,13 @@ def loadQueryClassifier():

 class QueryCheck(BaseComponent):
     """
-    Uses Query Classifier from Haystack, process the query based on query type
+    Uses the Query Classifier from Haystack to process the query based on the
+    query type. Its ability to detect statements is limited, so statements may
+    also get modified. Ex: "List water related issues" will be identified by
+    the model as keywords and therefore be processed as "find all issues
+    related to 'list all water related issues'". This shortcoming is ignored
+    for now, as it does not affect the semantic search much.
+
     1. https://docs.haystack.deepset.ai/docs/query_classifier

     """

@@ -69,8 +75,7 @@ def runSemanticPreprocessingPipeline(file_path, file_name,
                             split_length:int = 2, split_overlap = 0,
                             removePunc = False)->List[Document]:
     """
-    creates the pipeline and runs the preprocessing pipeline,
-    the params for pipeline are fetched from paramconfig
+    creates the pipeline and runs the preprocessing pipeline.

     Params
     ------------

@@ -132,7 +137,7 @@ def loadRetriever(embedding_model:Text = None, embedding_model_format:Text = No

     Return
     -------
-    retriever: emebedding model
+    retriever: embedding model
     """
     logging.info("loading retriever")
     if document_store is None:

@@ -151,7 +156,7 @@ def loadRetriever(embedding_model:Text = None, embedding_model_format:Text = No
 @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
 def createDocumentStore(documents:List[Document], similarity:str = 'cosine'):
     """
-    Creates the InMemory Document Store frm haystack list of Documents.
+    Creates the InMemory Document Store from haystack list of Documents.
     It is mandatory component for Retriever to work in Haystack frame work.

     Params

@@ -167,10 +172,6 @@ def createDocumentStore(documents:List[Document], similarity:str = 'cosine'):
     """
     document_store = InMemoryDocumentStore(similarity = similarity)
     document_store.write_documents(documents)
-    # if check_streamlit:
-    #     if 'retriever' in st.session_state:
-    #         retriever = st.session_state['retriever']
-    #         document_store.update_embeddings(retriever)

     return document_store

@@ -182,11 +183,10 @@ def semanticSearchPipeline(documents:List[Document], embedding_model:Text = Non
                            reader_model:str = None, reader_top_k:int = 10):
     """
     creates the semantic search pipeline and document Store object from the
-    list of haystack documents. Retriever and Reader model are read from
-    paramconfig. The top_k for the Reader and Retirever are kept same, so that
-    all the results returned by Retriever are used, however the context is
-    extracted by Reader for each retrieved result. The querycheck is added as
-    node to process the query.
+    list of haystack documents. The top_k for the Reader and Retriever are kept
+    the same, so that all the results returned by the Retriever are used,
+    however the context is extracted by the Reader for each retrieved result.
+    The querycheck is added as a node to process the query.
     1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
     2. https://www.sbert.net/examples/applications/semantic-search/README.html
     3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py

@@ -214,50 +214,22 @@ def semanticSearchPipeline(documents:List[Document], embedding_model:Text = Non
         nodes [QueryCheck, Retriever, Reader]

     document_store: As retriever can work only with Haystack Document Store, the
-        list of document returned by preprocessing pipeline.
+        list of documents returned by the preprocessing pipeline is fed in to
+        get an InMemoryDocumentStore object, with the retriever updating the
+        embeddings of each paragraph in the document store.

     """
-    document_store = createDocumentStore(documents)
-    # if check_streamlit:
-    #     if 'retriever' in st.session_state:
-    #     # if st.session_state['retriever']:
-    #         retriever = st.session_state['retriever']
-    #     else:
-    #         if embedding_model:
+    document_store = createDocumentStore(documents)
     retriever = loadRetriever(embedding_model = embedding_model,
                     embedding_model_format=embedding_model_format,
                     embedding_layer=embedding_layer,
                     retriever_top_k= retriever_top_k,
                     document_store = document_store)

-    #             st.session_state['retriever'] = retriever
-    #     else:
-    #         logging.warning("no streamlit enviornment found, neither embedding model \
-    #             provided")
-    #         return
-    # elif embedding_model:
-    #     retriever = loadRetriever(embedding_model = embedding_model,
-    #                     embedding_model_format=embedding_model_format,
-    #                     embedding_layer=embedding_layer,
-    #                     retriever_top_k= retriever_top_k,
-    #                     document_store = document_store)
-
-
     document_store.update_embeddings(retriever)
-    # retriever.document_store = document_store
     querycheck = QueryCheck()
-    # if check_streamlit:
-    #     if 'reader' in st.session_state:
-    #         reader = st.session_state['reader']
-
-    #     else:
-    #         if reader_model:
     reader = FARMReader(model_name_or_path=reader_model,
                     top_k = reader_top_k, use_gpu=True)
-    #         st.session_state['reader'] = reader
-    # elif reader_model:
-    #     reader = FARMReader(model_name_or_path=reader_model,
-    #                     top_k = reader_top_k, use_gpu=True)

     semanticsearch_pipeline = Pipeline()
     semanticsearch_pipeline.add_node(component = querycheck, name = "QueryCheck",

@@ -339,84 +311,8 @@ def semantic_search(query:Text,documents:List[Document],embedding_model:Text,
         end_idx = temp['offsets_in_document'][0]['end']
         match = [[start_idx,end_idx]]
         doc = doc_store.get_document_by_id(temp['document_id']).content
-        st.write("Result {}".format(i+1))
-        semanticsearchAnnotator(match, doc)
-
-
-
-        # if 'document_store' in st.session_state:
-        #     document_store = st.session_state['document_store']
-        #     temp = document_store.get_all_documents()
-        #     if st.session_state['filename'] != temp[0].meta['name']:
-
-        #         document_store = InMemoryDocumentStore()
-        #         document_store.write_documents(documents)
-        #         if 'retriever' in st.session_state:
-        #             retriever = st.session_state['retriever']
-        #             document_store.update_embeddings(retriever)
-        #             # querycheck =
-
-
-        #             # embedding_model = config.get('semantic_search','RETRIEVER')
-        #             # embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
-        #             # embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
-        #             # retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
-        #             # retriever = EmbeddingRetriever(
-        #             #     document_store=document_store,
-        #             #     embedding_model=embedding_model,top_k = retriever_top_k,
-        #             #     emb_extraction_layer=embedding_layer, scale_score =True,
-        #             #     model_format=embedding_model_format, use_gpu = True)
-        #             # document_store.update_embeddings(retriever)
-        #     else:
-        #         embedding_model = config.get('semantic_search','RETRIEVER')
-        #         embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
-        #         retriever = EmbeddingRetriever(
-        #             document_store=document_store,
-        #             embedding_model=embedding_model,top_k = retriever_top_k,
-        #             emb_extraction_layer=embedding_layer, scale_score =True,
-        #             model_format=embedding_model_format, use_gpu = True)
-
-        # else:
-        #     document_store = InMemoryDocumentStore()
-        #     document_store.write_documents(documents)
-
-        #     embedding_model = config.get('semantic_search','RETRIEVER')
-        #     embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
-        #     embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
-        #     retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
-
-
-        #     retriever = EmbeddingRetriever(
-        #         document_store=document_store,
-        #         embedding_model=embedding_model,top_k = retriever_top_k,
-        #         emb_extraction_layer=embedding_layer, scale_score =True,
-        #         model_format=embedding_model_format, use_gpu = True)
-        #     st.session_state['retriever'] = retriever
-        #     document_store.update_embeddings(retriever)
-        #     st.session_state['document_store'] = document_store
-        #     querycheck = QueryCheck()
-        #     st.session_state['querycheck'] = querycheck
-        #     reader_model = config.get('semantic_search','READER')
-        #     reader_top_k = retriever_top_k
-        #     reader = FARMReader(model_name_or_path=reader_model,
-        #         top_k = reader_top_k, use_gpu=True)
-
-        #     st.session_state['reader'] = reader
-
-        # querycheck = QueryCheck()
-
-        # reader_model = config.get('semantic_search','READER')
-        # reader_top_k = retriever_top_k
-        # reader = FARMReader(model_name_or_path=reader_model,
-        #     top_k = reader_top_k, use_gpu=True)
-
-
-        # semanticsearch_pipeline = Pipeline()
-        # semanticsearch_pipeline.add_node(component = querycheck, name = "QueryCheck",
-        #     inputs = ["Query"])
-        # semanticsearch_pipeline.add_node(component = retriever, name = "EmbeddingRetriever",
-        #     inputs = ["QueryCheck.output_1"])
-        # semanticsearch_pipeline.add_node(component = reader, name = "FARMReader",
-        #     inputs= ["EmbeddingRetriever"])
-
-        # return semanticsearch_pipeline, document_store
+        if check_streamlit:
+            st.write("Result {}".format(i+1))
+        else:
+            print("Result {}".format(i+1))
+        semanticsearchAnnotator(match, doc)
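
After this cleanup, semanticSearchPipeline() is a straight-line construction: document store → EmbeddingRetriever → update_embeddings → QueryCheck → FARMReader → Pipeline. A hedged, self-contained sketch of that shape with the farm-haystack 1.x API follows; the custom QueryCheck node is omitted and the model names are illustrative placeholders, not the values from paramconfig.cfg.

# Sketch of the pipeline shape built by semanticSearchPipeline() (farm-haystack
# 1.x). Model names below are examples only; the app reads its own from
# paramconfig.cfg, and the QueryCheck node is left out here.
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import EmbeddingRetriever, FARMReader
from haystack.pipelines import Pipeline
from haystack.schema import Document

document_store = InMemoryDocumentStore(similarity="cosine")
document_store.write_documents([Document(content="Net zero emissions should be reached by 2050.")])

retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/msmarco-distilbert-cos-v5",
    model_format="sentence_transformers",
    top_k=10,
)
document_store.update_embeddings(retriever)

reader = FARMReader(model_name_or_path="deepset/tinyroberta-squad2",
                    top_k=10, use_gpu=False)

pipeline = Pipeline()
pipeline.add_node(component=retriever, name="EmbeddingRetriever", inputs=["Query"])
pipeline.add_node(component=reader, name="FARMReader", inputs=["EmbeddingRetriever"])

results = pipeline.run(query="When should emissions reach net zero?")
for answer in results["answers"]:
    print(answer.answer, answer.score)
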