prashant
committed on
Commit
•
eaa8795
1
Parent(s):
dd2ea3c
updating keywordslist and about app
Browse files- appStore/keyword_search.py +20 -11
- docStore/sample/keywordexample.json +2 -1
- paramconfig.cfg +3 -6
- utils/semantic_search.py +1 -1
appStore/keyword_search.py
CHANGED
@@ -53,7 +53,8 @@ def app():
|
|
53 |
st.write("")
|
54 |
st.write(""" The application allows its user to perform a keyword search\
|
55 |
based on two options: a lexical ([TFIDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf))\
|
56 |
-
search and semantic bi-encoder
|
|
|
57 |
approaches is quite straightforward; while the lexical search only \
|
58 |
displays paragraphs in the document with exact matching results, \
|
59 |
the semantic search shows paragraphs with meaningful connections \
|
@@ -62,9 +63,12 @@ def app():
|
|
62 |
methods employ a probabilistic retrieval framework in its identification\
|
63 |
of relevant paragraphs. By default the search is performed using \
|
64 |
'Semantic Search', and to find 'Exact/Lexical Matches' please tick the \
|
65 |
-
checkbox provided
|
66 |
the application allows the user to search for pre-defined keywords \
|
67 |
from different thematic buckets present in sidebar.""")
|
|
|
|
|
|
|
68 |
|
69 |
|
70 |
with st.sidebar:
|
@@ -82,17 +86,22 @@ def app():
|
|
82 |
st.markdown("---")
|
83 |
|
84 |
with st.container():
|
85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
# queryList = st.text_input("You selected the {} category we \
|
87 |
-
# will look for these keywords in document".format(genre)
|
88 |
# value="{}".format(keywordList))
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
value = "{}".format(keywordList))
|
96 |
searchtype = st.checkbox("Show only Exact Matches")
|
97 |
if st.button("Find them"):
|
98 |
|
|
|
53 |
st.write("")
|
54 |
st.write(""" The application allows its user to perform a keyword search\
|
55 |
based on two options: a lexical ([TFIDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf))\
|
56 |
+
search and semantic [bi-encoder](https://www.sbert.net/examples/applications/retrieve_rerank/README.html)\
|
57 |
+
search. The difference between both \
|
58 |
approaches is quite straightforward; while the lexical search only \
|
59 |
displays paragraphs in the document with exact matching results, \
|
60 |
the semantic search shows paragraphs with meaningful connections \
|
|
|
63 |
methods employ a probabilistic retrieval framework in its identification\
|
64 |
of relevant paragraphs. By default the search is performed using \
|
65 |
'Semantic Search', and to find 'Exact/Lexical Matches' please tick the \
|
66 |
+
checkbox provided which will by-pass semantic search. Furthermore,\
|
67 |
the application allows the user to search for pre-defined keywords \
|
68 |
from different thematic buckets present in sidebar.""")
|
69 |
+
st.write("")
|
70 |
+
st.write(""" The Exact Matches search returns the top {} findings, and Semantic
|
71 |
+
search provides the top {} answers.""".format(lexical_top_k, retriever_top_k))
|
72 |
|
73 |
|
74 |
with st.sidebar:
|
|
|
86 |
st.markdown("---")
|
87 |
|
88 |
with st.container():
|
89 |
+
type_hinting = "Please enter here your question and we \
|
90 |
+
will look for an answer in the document\
|
91 |
+
OR enter the keyword you are looking \
|
92 |
+
for and we will look for similar\
|
93 |
+
context in the document. If you don't have anything,\
|
94 |
+
try the presets of keywords from sidebar. "
|
95 |
+
if keywordList is not None:
|
96 |
# queryList = st.text_input("You selected the {} category we \
|
97 |
+
# will look for these keywords in document".format(genre)
|
98 |
# value="{}".format(keywordList))
|
99 |
+
queryList = st.text_input(type_hinting,
|
100 |
+
value = "{}".format(keywordList))
|
101 |
+
else:
|
102 |
+
queryList = st.text_input(type_hinting,
|
103 |
+
placeholder="Enter keyword/query here")
|
104 |
+
|
|
|
105 |
searchtype = st.checkbox("Show only Exact Matches")
|
106 |
if st.button("Find them"):
|
107 |
|
docStore/sample/keywordexample.json
CHANGED
@@ -3,5 +3,6 @@
|
|
3 |
"Food":"Food security,Nutrition,Diets,Food loss",
|
4 |
"Implementation":"Implementation,transformation,reform,integration,strategy,policy",
|
5 |
"Nature":"Nature,Nature-based solutions,Biodiversity,Degradation",
|
6 |
-
"Social":"Indigenous,Local community(ies),
|
|
|
7 |
}
|
|
|
3 |
"Food":"Food security,Nutrition,Diets,Food loss",
|
4 |
"Implementation":"Implementation,transformation,reform,integration,strategy,policy",
|
5 |
"Nature":"Nature,Nature-based solutions,Biodiversity,Degradation",
|
6 |
+
"Social":"Indigenous,Local community(ies),Rural livelihoods,Minority",
|
7 |
+
"Gender":"gender, women empowerment, women economic power, gender bias"
|
8 |
}
|
paramconfig.cfg
CHANGED
@@ -14,10 +14,9 @@ EMBEDDING_DIM = 768
|
|
14 |
RETRIEVER_EMB_LAYER = -1
|
15 |
READER = deepset/tinyroberta-squad2
|
16 |
READER_TOP_K = 10
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
SPLIT_OVERLAP = 1
|
21 |
RESPECT_SENTENCE_BOUNDARY = 1
|
22 |
REMOVE_PUNC = 0
|
23 |
|
@@ -31,8 +30,6 @@ SPLIT_OVERLAP = 10
|
|
31 |
RESPECT_SENTENCE_BOUNDARY = 1
|
32 |
TOP_KEY = 15
|
33 |
|
34 |
-
[tfidf]
|
35 |
-
TOP_N = 20
|
36 |
|
37 |
[coherence]
|
38 |
RETRIEVER_TOP_K = 10
|
|
|
14 |
RETRIEVER_EMB_LAYER = -1
|
15 |
READER = deepset/tinyroberta-squad2
|
16 |
READER_TOP_K = 10
|
17 |
+
SPLIT_BY = word
|
18 |
+
SPLIT_LENGTH = 120
|
19 |
+
SPLIT_OVERLAP = 10
|
|
|
20 |
RESPECT_SENTENCE_BOUNDARY = 1
|
21 |
REMOVE_PUNC = 0
|
22 |
|
|
|
30 |
RESPECT_SENTENCE_BOUNDARY = 1
|
31 |
TOP_KEY = 15
|
32 |
|
|
|
|
|
33 |
|
34 |
[coherence]
|
35 |
RETRIEVER_TOP_K = 10
|
utils/semantic_search.py
CHANGED
@@ -450,7 +450,7 @@ def process_semantic_output(results):
|
|
450 |
'reader_score','retriever_score','id',]. Distinguishes if it's a single query or
|
451 |
multi queries by reading the pipeline output dictionary keys.
|
452 |
Uses the process_query_output to get the dataframe for each query and create
|
453 |
-
one concataneted dataframe. In case
|
454 |
the answers part. See documentations of process_query_output.
|
455 |
|
456 |
Params
|
|
|
450 |
'reader_score','retriever_score','id',]. Distinguishes if it's a single query or
|
451 |
multi queries by reading the pipeline output dictionary keys.
|
452 |
Uses the process_query_output to get the dataframe for each query and create
|
453 |
+
one concatenated dataframe. In case of Docs2Answers as final node, deletes
|
454 |
the answers part. See documentations of process_query_output.
|
455 |
|
456 |
Params
|