|
|
|
import sys

# Make the shared helper modules importable when this page is run standalone.
sys.path.append('../utils')
|
|
|
import json
import logging

import streamlit as st

from utils.lexical_search import runLexicalPreprocessingPipeline, lexical_search
from utils.semantic_search import runSemanticPreprocessingPipeline, semantic_keywordsearch
from utils.checkconfig import getconfig
from utils.streamlitcheck import checkbox_without_preselect
|
|
|
|
|
# Load all tunable search parameters from paramconfig.cfg at import time.
config = getconfig('paramconfig.cfg')

# Semantic search: preprocessing, retriever and reader settings
split_by = config.get('semantic_search', 'SPLIT_BY')
split_length = int(config.get('semantic_search', 'SPLIT_LENGTH'))
split_overlap = int(config.get('semantic_search', 'SPLIT_OVERLAP'))
split_respect_sentence_boundary = bool(int(config.get('semantic_search',
                                           'RESPECT_SENTENCE_BOUNDARY')))
remove_punc = bool(int(config.get('semantic_search', 'REMOVE_PUNC')))
embedding_model = config.get('semantic_search', 'RETRIEVER')
embedding_model_format = config.get('semantic_search', 'RETRIEVER_FORMAT')
embedding_layer = int(config.get('semantic_search', 'RETRIEVER_EMB_LAYER'))
embedding_dim = int(config.get('semantic_search', 'EMBEDDING_DIM'))
max_seq_len = int(config.get('semantic_search', 'MAX_SEQ_LENGTH'))
retriever_top_k = int(config.get('semantic_search', 'RETRIEVER_TOP_K'))
reader_model = config.get('semantic_search', 'READER')
reader_top_k = retriever_top_k  # the reader reuses the retriever's top-k setting
top_k_per_candidate = int(config.get('semantic_search', 'READER_TOP_K_PER_CANDIDATE'))

# Lexical search: preprocessing settings
lexical_split_by = config.get('lexical_search', 'SPLIT_BY')
lexical_split_length = int(config.get('lexical_search', 'SPLIT_LENGTH'))
lexical_split_overlap = int(config.get('lexical_search', 'SPLIT_OVERLAP'))
lexical_remove_punc = bool(int(config.get('lexical_search', 'REMOVE_PUNC')))
lexical_top_k = int(config.get('lexical_search', 'TOP_K'))
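
# For reference, a minimal sketch of the expected paramconfig.cfg layout.
# The key names mirror the reads above; the values are illustrative
# assumptions, not the shipped defaults:
#
#   [semantic_search]
#   SPLIT_BY = sentence
#   SPLIT_LENGTH = 3
#   SPLIT_OVERLAP = 0
#   RESPECT_SENTENCE_BOUNDARY = 1
#   REMOVE_PUNC = 0
#   RETRIEVER = all-MiniLM-L6-v2
#   RETRIEVER_FORMAT = sentence_transformers
#   RETRIEVER_EMB_LAYER = -1
#   EMBEDDING_DIM = 384
#   MAX_SEQ_LENGTH = 256
#   RETRIEVER_TOP_K = 10
#   READER = deepset/tinyroberta-squad2
#   READER_TOP_K_PER_CANDIDATE = 3
#
#   [lexical_search]
#   SPLIT_BY = word
#   SPLIT_LENGTH = 100
#   SPLIT_OVERLAP = 0
#   REMOVE_PUNC = 1
#   TOP_K = 10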
|
|
|
def app(): |
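    """Render the keyword-search page: an intro/about section, a sidebar of
    predefined keyword sets, and a query box that dispatches to either the
    lexical (TF-IDF) search or the semantic (bi-encoder) search pipeline."""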
|
|
|
with st.container(): |
|
st.markdown("<h1 style='text-align: center; \ |
|
color: black;'> Search</h1>", |
|
unsafe_allow_html=True) |
|
st.write(' ') |
|
st.write(' ') |
|
|
|
with st.expander("ℹ️ - About this app", expanded=False): |
|
|
|
        st.write(
            """
            The *Keyword Search* app is an easy-to-use interface \
            built in Streamlit for performing keyword searches in \
            policy documents, developed by GIZ Data and the \
            Sustainable Development Solutions Network.
            """)
|
st.write("") |
|
st.write(""" The application allows its user to perform a keyword search\ |
|
based on two options: a lexical ([TFIDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf))\ |
|
search and semantic [bi-encoder](https://www.sbert.net/examples/applications/retrieve_rerank/README.html)\ |
|
search. The difference between both \ |
|
approaches is quite straightforward; while the lexical search only \ |
|
displays paragraphs in the document with exact matching results, \ |
|
the semantic search shows paragraphs with meaningful connections \ |
|
(e.g., synonyms) based on the context as well. The semantic search \ |
|
allows for a personalized experience in using the application. Both \ |
|
methods employ a probabilistic retrieval framework in its identification\ |
|
of relevant paragraphs. By defualt the search is performed using \ |
|
'Semantic Search', and to find 'Exact/Lexical Matches' please tick the \ |
|
checkbox provided which will by-pass semantic search. Furthermore,\ |
|
the application allows the user to search for pre-defined keywords \ |
|
from different thematic buckets present in sidebar.""") |
|
st.write("") |
|
st.write(""" The Exact Matches gives back top {} findings, and Semantic |
|
search provides with top {} answers.""".format(lexical_top_k, retriever_top_k)) |
|
st.write("") |
|
st.write("") |
|
st.markdown("Some runtime metrics tested with cpu: Intel(R) Xeon(R) CPU @ 2.20GHz, memory: 13GB") |
|
col1,col2,col3= st.columns([2,4,4]) |
|
with col1: |
|
st.caption("OCR File processing") |
|
|
|
st.write("50 sec") |
|
|
|
with col2: |
|
st.caption("Lexical Search on 200 paragraphs(~ 35 pages)") |
|
|
|
st.write("15 sec") |
|
|
|
with col3: |
|
st.caption("Semantic search on 200 paragraphs(~ 35 pages)") |
|
|
|
st.write("120 sec(including emebedding creation)") |
|
|
|
with st.sidebar: |
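        # docStore/sample/keywordexample.json is assumed to map each thematic
        # bucket (sidebar category) to a string of example keywords.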
|
with open('docStore/sample/keywordexample.json','r') as json_file: |
|
keywordexample = json.load(json_file) |
|
|
|
|
|
st.caption("Select Keyword Category") |
|
genre = checkbox_without_preselect(list(keywordexample.keys())) |
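        # genre is assumed to be the selected category label, or None when no
        # checkbox is ticked (checkbox_without_preselect avoids a default pick).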
|
if genre: |
|
keywordList = keywordexample[genre] |
|
else: |
|
keywordList = None |
|
|
|
st.markdown("---") |
|
|
|
with st.container(): |
|
        type_hinting = "Please enter your question here and we \
                        will look for an answer in the document, \
                        OR enter the keyword you are looking \
                        for and we will look for similar \
                        context in the document. If you don't have anything, \
                        try the preset keywords from the sidebar."
|
if keywordList is not None: |
|
|
|
|
|
|
|
            queryList = st.text_input(type_hinting,
                                      value="{}".format(keywordList))
|
else: |
|
queryList = st.text_input(type_hinting, |
|
placeholder="Enter keyword/query here") |
|
|
|
searchtype = st.checkbox("Show only Exact Matches") |
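        # The checkbox flips the search mode: ticked -> exact/lexical (TF-IDF)
        # matching, unticked (default) -> semantic bi-encoder search. Both
        # search helpers are assumed to render their results directly to the page.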
|
if st.button("Find them"): |
|
|
|
if queryList == "": |
|
st.info("🤔 No keyword provided, if you dont have any, \ |
|
please try example sets from sidebar!") |
|
logging.warning("Terminated as no keyword provided") |
|
else: |
|
if 'filepath' in st.session_state: |
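                # 'filepath' and 'filename' are expected to be placed in the
                # session state by the document upload step elsewhere in the app.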
|
|
|
if searchtype: |
|
all_documents = runLexicalPreprocessingPipeline( |
|
file_name=st.session_state['filename'], |
|
file_path=st.session_state['filepath'], |
|
split_by=lexical_split_by, |
|
split_length=lexical_split_length, |
|
split_overlap=lexical_split_overlap, |
|
remove_punc=lexical_remove_punc) |
|
logging.info("performing lexical search") |
|
with st.spinner("Performing Exact matching search \ |
|
(Lexical search) for you"): |
|
lexical_search(query=queryList, |
|
documents = all_documents['documents'], |
|
top_k = lexical_top_k ) |
|
else: |
|
                        all_documents = runSemanticPreprocessingPipeline(
                                            file_path=st.session_state['filepath'],
                                            file_name=st.session_state['filename'],
                                            split_by=split_by,
                                            split_length=split_length,
                                            split_overlap=split_overlap,
                                            remove_punc=remove_punc,
                                            split_respect_sentence_boundary=split_respect_sentence_boundary)
|
if len(all_documents['documents']) > 100: |
|
                            warning_msg = ": this might take some time, please sit back and relax."
|
else: |
|
warning_msg = "" |
|
|
|
logging.info("starting semantic search") |
|
with st.spinner("Performing Similar/Contextual search{}".format(warning_msg)): |
|
                            semantic_keywordsearch(query=queryList,
                                                   documents=all_documents['documents'],
                                                   embedding_model=embedding_model,
                                                   embedding_layer=embedding_layer,
                                                   embedding_model_format=embedding_model_format,
                                                   reader_model=reader_model,
                                                   reader_top_k=reader_top_k,
                                                   retriever_top_k=retriever_top_k,
                                                   embedding_dim=embedding_dim,
                                                   max_seq_len=max_seq_len,
                                                   top_k_per_candidate=top_k_per_candidate)
|
|
|
else: |
|
st.info("🤔 No document found, please try to upload it at the sidebar!") |
|
logging.warning("Terminated as no document provided") |
|
|
|
|
|
|
|
|
|
|