prashant
committed on
Commit
•
2370cfa
1
Parent(s):
eaa8795
adding reader top_k per candidate
Browse files- appStore/keyword_search.py +3 -1
- paramconfig.cfg +1 -0
- utils/semantic_search.py +8 -4
appStore/keyword_search.py
CHANGED
@@ -26,6 +26,7 @@ max_seq_len = int(config.get('semantic_search','MAX_SEQ_LENGTH'))
|
|
26 |
retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
|
27 |
reader_model = config.get('semantic_search','READER')
|
28 |
reader_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
|
|
|
29 |
lexical_split_by= config.get('lexical_search','SPLIT_BY')
|
30 |
lexical_split_length=int(config.get('lexical_search','SPLIT_LENGTH'))
|
31 |
lexical_split_overlap = int(config.get('lexical_search','SPLIT_OVERLAP'))
|
@@ -149,7 +150,8 @@ def app():
|
|
149 |
embedding_model_format=embedding_model_format,
|
150 |
reader_model=reader_model,reader_top_k=reader_top_k,
|
151 |
retriever_top_k=retriever_top_k, embedding_dim=embedding_dim,
|
152 |
-
max_seq_len=max_seq_len
|
|
|
153 |
|
154 |
else:
|
155 |
st.info("🤔 No document found, please try to upload it at the sidebar!")
|
|
|
26 |
retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
|
27 |
reader_model = config.get('semantic_search','READER')
|
28 |
reader_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
|
29 |
+
top_k_per_candidate = int(config.get('semantic_search','READER_TOP_K_PER_CANDIDATE'))
|
30 |
lexical_split_by= config.get('lexical_search','SPLIT_BY')
|
31 |
lexical_split_length=int(config.get('lexical_search','SPLIT_LENGTH'))
|
32 |
lexical_split_overlap = int(config.get('lexical_search','SPLIT_OVERLAP'))
|
|
|
150 |
embedding_model_format=embedding_model_format,
|
151 |
reader_model=reader_model,reader_top_k=reader_top_k,
|
152 |
retriever_top_k=retriever_top_k, embedding_dim=embedding_dim,
|
153 |
+
max_seq_len=max_seq_len,
|
154 |
+
top_k_per_candidate = top_k_per_candidate)
|
155 |
|
156 |
else:
|
157 |
st.info("🤔 No document found, please try to upload it at the sidebar!")
|
paramconfig.cfg
CHANGED
@@ -14,6 +14,7 @@ EMBEDDING_DIM = 768
|
|
14 |
RETRIEVER_EMB_LAYER = -1
|
15 |
READER = deepset/tinyroberta-squad2
|
16 |
READER_TOP_K = 10
|
|
|
17 |
SPLIT_BY = word
|
18 |
SPLIT_LENGTH = 120
|
19 |
SPLIT_OVERLAP = 10
|
|
|
14 |
RETRIEVER_EMB_LAYER = -1
|
15 |
READER = deepset/tinyroberta-squad2
|
16 |
READER_TOP_K = 10
|
17 |
+
READER_TOP_K_PER_CANDIDATE = 1
|
18 |
SPLIT_BY = word
|
19 |
SPLIT_LENGTH = 120
|
20 |
SPLIT_OVERLAP = 10
|
utils/semantic_search.py
CHANGED
@@ -245,7 +245,8 @@ def semanticSearchPipeline(documents:List[Document], embedding_model:Text = Non
|
|
245 |
embedding_model_format:Text = None,embedding_layer:int = None,
|
246 |
embedding_dim:int = 768,retriever_top_k:int = 10,
|
247 |
reader_model:str = None, reader_top_k:int = 10,
|
248 |
-
max_seq_len:int =512,useQueryCheck = True,
|
|
|
249 |
"""
|
250 |
creates the semantic search pipeline and document Store object from the
|
251 |
list of haystack documents. The top_k for the Reader and Retriever are kept
|
@@ -290,7 +291,8 @@ def semanticSearchPipeline(documents:List[Document], embedding_model:Text = Non
|
|
290 |
max_seq_len: every model has a max seq len it can handle, check in model card.
|
291 |
Needed to handle the edge cases
|
292 |
useQueryCheck: Whether to use the querycheck which modifies the query or not.
|
293 |
-
|
|
|
294 |
|
295 |
Return
|
296 |
---------
|
@@ -318,7 +320,8 @@ def semanticSearchPipeline(documents:List[Document], embedding_model:Text = Non
|
|
318 |
if useQueryCheck and reader_model:
|
319 |
querycheck = QueryCheck()
|
320 |
reader = FARMReader(model_name_or_path=reader_model,
|
321 |
-
top_k = reader_top_k, use_gpu=True
|
|
|
322 |
semantic_search_pipeline.add_node(component = querycheck,
|
323 |
name = "QueryCheck",inputs = ["Query"])
|
324 |
semantic_search_pipeline.add_node(component = retriever,
|
@@ -328,7 +331,8 @@ def semanticSearchPipeline(documents:List[Document], embedding_model:Text = Non
|
|
328 |
|
329 |
elif reader_model :
|
330 |
reader = FARMReader(model_name_or_path=reader_model,
|
331 |
-
top_k = reader_top_k, use_gpu=True
|
|
|
332 |
semantic_search_pipeline.add_node(component = retriever,
|
333 |
name = "EmbeddingRetriever",inputs = ["Query"])
|
334 |
semantic_search_pipeline.add_node(component = reader,
|
|
|
245 |
embedding_model_format:Text = None,embedding_layer:int = None,
|
246 |
embedding_dim:int = 768,retriever_top_k:int = 10,
|
247 |
reader_model:str = None, reader_top_k:int = 10,
|
248 |
+
max_seq_len:int =512,useQueryCheck = True,
|
249 |
+
top_k_per_candidate:int = 1):
|
250 |
"""
|
251 |
creates the semantic search pipeline and document Store object from the
|
252 |
list of haystack documents. The top_k for the Reader and Retriever are kept
|
|
|
291 |
max_seq_len: every model has a max seq len it can handle, check in model card.
|
292 |
Needed to handle the edge cases
|
293 |
useQueryCheck: Whether to use the querycheck which modifies the query or not.
|
294 |
+
top_k_per_candidate:How many answers to extract for each candidate doc
|
295 |
+
that is coming from the retriever
|
296 |
|
297 |
Return
|
298 |
---------
|
|
|
320 |
if useQueryCheck and reader_model:
|
321 |
querycheck = QueryCheck()
|
322 |
reader = FARMReader(model_name_or_path=reader_model,
|
323 |
+
top_k = reader_top_k, use_gpu=True,
|
324 |
+
top_k_per_candidate = top_k_per_candidate)
|
325 |
semantic_search_pipeline.add_node(component = querycheck,
|
326 |
name = "QueryCheck",inputs = ["Query"])
|
327 |
semantic_search_pipeline.add_node(component = retriever,
|
|
|
331 |
|
332 |
elif reader_model :
|
333 |
reader = FARMReader(model_name_or_path=reader_model,
|
334 |
+
top_k = reader_top_k, use_gpu=True,
|
335 |
+
top_k_per_candidate = top_k_per_candidate)
|
336 |
semantic_search_pipeline.add_node(component = retriever,
|
337 |
name = "EmbeddingRetriever",inputs = ["Query"])
|
338 |
semantic_search_pipeline.add_node(component = reader,
|