# SDSN-demo / utils/search.py
# Lexical search with Haystack's TfidfRetriever plus spaCy-based
# match highlighting.
import configparser
import re

import spacy
from spacy.matcher import Matcher
import streamlit as st
from annotated_text import annotation
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TfidfRetriever
from markdown import markdown
config = configparser.ConfigParser()
# Despite the .py extension, paramconfig.py is read as an INI-style file.
with open('paramconfig.py') as configfile:
    config.read_file(configfile)
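# The code below expects a [lexical_search] section with a TOP_K entry,
# e.g. (illustrative values, not the actual paramconfig.py):
#
#   [lexical_search]
#   TOP_K = 5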
def tokenize_lexical_query(query):
    """Split the query into lowercased tokens, dropping spaCy stop words."""
    nlp = spacy.load("en_core_web_sm")
    token_list = [token.text.lower() for token in nlp(query)
                  if not token.is_stop]
    return token_list
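# Example (illustrative query):
#   tokenize_lexical_query("What are the national climate targets?")
#   -> ['national', 'climate', 'targets', '?']
# Stop words are dropped, but punctuation tokens are kept.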
def runSpacyMatcher(token_list, document):
    """Find query tokens in the document with spaCy's rule-based Matcher.

    Returns [start, end] *token* indices into the returned spaCy Doc,
    plus the Doc itself.
    """
    # Loading the model on every call is slow; a module-level cache would
    # avoid the repeated disk reads.
    nlp = spacy.load("en_core_web_sm")
    spacydoc = nlp(document)
    matcher = Matcher(nlp.vocab)
    token_pattern = [[{"LOWER": token}] for token in token_list]
    matcher.add(",".join(token_list), token_pattern)
    spacymatches = matcher(spacydoc)
    # Keep only the [start, end] token offsets of each match.
    matches = [[start, end] for match_id, start, end in spacymatches]
    return matches, spacydoc
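# Example (illustrative): with token_list ['climate'] and document
# "Climate change is real", the matcher returns [[0, 1]] -- token
# indices into the Doc, not character offsets.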
def runRegexMatcher(token_list, document):
    """Regex-based counterpart of runSpacyMatcher; returns [start, end]
    *character* offsets into the document string, plus the document.
    Matching is case-sensitive."""
    matches = []
    for token in token_list:
        # re.escape guards against tokens with regex metacharacters.
        matches += [[val.start(), val.start() + len(token)]
                    for val in re.finditer(re.escape(token), document)]
    return matches, document
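# Example (illustrative): with token_list ['climate'] and document
# "climate change", this returns [[0, 7]] -- character offsets, unlike
# the token offsets produced by runSpacyMatcher.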
def searchAnnotator(matches, document):
    """Render the document in Streamlit, highlighting each match."""
    start = 0
    annotated_text = ""
    for match in matches:
        start_idx = match[0]
        end_idx = match[1]
        # spaCy Doc slices are Spans; text_with_ws keeps the whitespace
        # between tokens. Plain strings (the regex path) pass through as-is.
        pre = document[start:start_idx]
        body = document[start_idx:end_idx]
        pre = getattr(pre, "text_with_ws", pre)
        body = getattr(body, "text_with_ws", body)
        annotated_text = annotated_text + pre + str(
            annotation(body=body, label="ANSWER",
                       background="#964448", color='#ffffff'))
        start = end_idx
    # Append the text that follows the last match.
    tail = document[start:]
    annotated_text = annotated_text + getattr(tail, "text_with_ws", tail)
    st.write(
        markdown(annotated_text),
        unsafe_allow_html=True,
    )
def lexical_search(query, documents):
    """Retrieve the top documents by TF-IDF and render highlighted matches."""
    document_store = InMemoryDocumentStore()
    document_store.write_documents(documents)
    retriever = TfidfRetriever(document_store)
    results = retriever.retrieve(
        query=query,
        top_k=int(config.get('lexical_search', 'TOP_K')))
    query_tokens = tokenize_lexical_query(query)
    for result in results:
        matches, doc = runSpacyMatcher(query_tokens, result.content)
        searchAnnotator(matches, doc)
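# Minimal usage sketch (assumes a Streamlit context and a paramconfig.py
# with [lexical_search] TOP_K; the Document texts are illustrative):
#
#   from haystack.schema import Document
#
#   docs = [Document(content="Climate change adaptation is a priority."),
#           Document(content="Renewable energy targets for 2030.")]
#   lexical_search("climate adaptation", docs)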