# Source: Hugging Face Spaces — GIZ / SDSN-demo, "ver0.1 scripts/keyword_search.py"
# Commit cc5c327 ("lexical search app update", author: prashant).
# NOTE: the lines above were web-page residue from the Spaces UI; converted to
# comments so the module is importable.
# set path
import glob, os, sys
from udfPreprocess.search import semantic_search
sys.path.append('../udfPreprocess')
#import helper
import udfPreprocess.docPreprocessing as pre
import udfPreprocess.cleaning as clean
from udfPreprocess.search import bm25_tokenizer, bm25TokenizeDoc, lexical_search
#import needed libraries
import seaborn as sns
from pandas import DataFrame
from sentence_transformers import SentenceTransformer, CrossEncoder, util
# from keybert import KeyBERT
from transformers import pipeline
import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
import pandas as pd
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np
import docx
from docx.shared import Inches
from docx.shared import Pt
from docx.enum.style import WD_STYLE_TYPE
import logging
logger = logging.getLogger(__name__)
import tempfile
import sqlite3
import json
import configparser
def app():
    """Render the keyword-search page of the Streamlit app.

    Lets the user pick a keyword category (loaded from
    ``sample/keywordexample.json``) or type free-form keywords, then runs
    either an exact-match (BM25 lexical) search or a semantic search over
    the paragraphs of the document previously stored in
    ``st.session_state`` by the upload step.

    Side effects only (widgets, logging); returns None.
    """
    with st.container():
        st.markdown("<h1 style='text-align: center; \
                      color: black;'> Search</h1>",
                    unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=False):
        st.write(
            """
            The *Keyword Search* app is an easy-to-use interface \
            built in Streamlit for doing keyword search in \
            policy document - developed by GIZ Data and the \
            Sustainable Development Solution Network.
            """)
        st.markdown("")

    with st.sidebar:
        with open('sample/keywordexample.json', 'r') as json_file:
            keywordexample = json.load(json_file)

        genre = st.radio("Select Keyword Category", list(keywordexample.keys()))
        # Dict lookup replaces the original five-way if/elif chain; an
        # unknown category falls back to None exactly as before.
        keywordList = keywordexample.get(genre)

        searchtype = st.selectbox("Do you want to find exact macthes or similar meaning/context",
                                  ['Exact Matches', 'Similar context/meaning'])

    with st.container():
        if keywordList is not None:
            queryList = st.text_input("You selcted the {} category we will look for these keywords in document".format(genre),
                                      value="{}".format(keywordList))
        else:
            queryList = st.text_input("Please enter here your question and we will look \
                                      for an answer in the document OR enter the keyword you \
                                      are looking for and we will \
                                      we will look for similar context \
                                      in the document.",
                                      placeholder="Enter keyword here")

        if st.button("Find them"):
            if queryList == "":
                st.info("🤔 No keyword provided, if you dont have any, please try example sets from sidebar!")
                logging.warning("Terminated as no keyword provided")
            else:
                if 'docs' in st.session_state:
                    docs = st.session_state['docs']
                    paraList = st.session_state['paraList']

                    if searchtype == 'Exact Matches':
                        # Comma-separated input becomes one BM25 query per keyword.
                        queryList = list(queryList.split(","))
                        logging.info("performing lexical search")
                        tokenized_corpus = bm25TokenizeDoc(paraList)
                        document_bm25 = BM25Okapi(tokenized_corpus)

                        with st.spinner("Performing Exact matching search (Lexical search) for you"):
                            st.markdown("##### Top few lexical search (BM25) hits #####")
                            for keyword in queryList:
                                bm25_hits = lexical_search(keyword, document_bm25)
                                counter = 0
                                for hit in bm25_hits:
                                    # BM25 score 0 means no term overlap — skip.
                                    if hit['score'] > 0.00:
                                        counter += 1
                                        if counter == 1:
                                            # Print the keyword header once, before the first hit.
                                            st.markdown("###### Results for keyword: **{}** ######".format(keyword))
                                        st.write("\t {}: {}\t".format(counter, paraList[hit['corpus_id']].replace("\n", " ")))
                                if counter == 0:
                                    st.write("No results found for '**{}**' ".format(keyword))
                                st.markdown("---")
                    else:
                        logging.info("starting semantic search")
                        with st.spinner("Performing Similar/Contextual search"):
                            query = "Find {} related issues ?".format(queryList)
                            config = configparser.ConfigParser()
                            # Close the config file deterministically — the
                            # original leaked the handle via a bare open().
                            with open('udfPreprocess/paramconfig.cfg') as config_file:
                                config.read_file(config_file)
                            # Minimum similarity score a hit must reach to be shown.
                            threshold = float(config.get('semantic_search', 'THRESHOLD'))
                            semantic_hits = semantic_search(query, paraList)
                            st.markdown("##### Few Semantic search hits for {} related topics #####".format(queryList))
                            for i, queryhit in enumerate(semantic_hits):
                                counter = 0
                                for hit in queryhit:
                                    counter += 1
                                    if hit['score'] > threshold:
                                        st.write("\t {}: \t {}".format(counter, paraList[hit['corpus_id']].replace("\n", " ")))
                                st.markdown("---")
                else:
                    st.info("🤔 No document found, please try to upload it at the sidebar!")
                    # Fixed: this branch means no document was uploaded, not a
                    # missing keyword (the original copied the wrong message).
                    logging.warning("Terminated as no document provided")