|
from haystack.nodes import TfidfRetriever |
|
from haystack.document_stores import InMemoryDocumentStore |
|
import configparser |
|
import spacy |
|
import re |
|
from spacy.matcher import Matcher |
|
import streamlit as st |
|
from markdown import markdown |
|
from annotated_text import annotation |
|
|
|
# Load search parameters (e.g. lexical_search/TOP_K) from the config file.
# NOTE(review): despite the .py extension, 'paramconfig.py' is parsed as an
# INI-style file by configparser, not imported as Python — confirm intended.
config = configparser.ConfigParser()
# Bug fix: the original `config.read_file(open(...))` never closed the file
# handle. A `with` block closes it deterministically while still raising
# FileNotFoundError when the file is missing (unlike `config.read`, which
# would silently skip a missing file).
with open('paramconfig.py') as _cfg_file:
    config.read_file(_cfg_file)
|
|
|
|
|
def tokenize_lexical_query(query):
    """Break a raw search query into lowercased, non-stopword tokens.

    Parameters
    ----------
    query : str
        The user's search string.

    Returns
    -------
    list[str]
        Lowercased token texts with spaCy's English stopwords removed.
    """
    nlp = spacy.load("en_core_web_sm")
    parsed = nlp(query)
    return [tok.text.lower() for tok in parsed if not tok.is_stop]
|
|
|
def runSpacyMatcher(token_list, document):
    """Find query tokens in *document* using spaCy's rule-based Matcher.

    Parameters
    ----------
    token_list : list[str]
        Lowercased query tokens (as produced by tokenize_lexical_query).
    document : str
        Raw text to search within.

    Returns
    -------
    tuple[list[list[int]], spacy.tokens.Doc]
        A list of [start, end] TOKEN-index spans (one per match), plus the
        parsed Doc the spans index into.
    """
    nlp = spacy.load("en_core_web_sm")
    parsed_doc = nlp(document)

    matcher = Matcher(nlp.vocab)
    # One single-token, case-insensitive pattern per query token; all patterns
    # are registered under a single rule keyed by the joined token list.
    patterns = [[{"LOWER": tok}] for tok in token_list]
    matcher.add(",".join(token_list), patterns)

    spans = [[start, end] for _rule_id, start, end in matcher(parsed_doc)]
    return spans, parsed_doc
|
|
|
def runRegexMatcher(token_list, document):
    """Find character-level spans of each query token in *document*.

    Bug fixes vs. the original:
    - Tokens were interpolated into the regex as raw patterns, so a token
      containing metacharacters (e.g. "c++", "(") raised ``re.error`` or
      matched the wrong text. ``re.escape`` makes them literal.
    - Matching was case-sensitive even though tokenize_lexical_query
      lowercases every token, so "Python" in the document never matched
      token "python". ``re.IGNORECASE`` makes this consistent with the
      case-insensitive {"LOWER": ...} patterns used by runSpacyMatcher.

    Parameters
    ----------
    token_list : list[str]
        Lowercased query tokens to locate.
    document : str
        Text to search within.

    Returns
    -------
    tuple[list[list[int]], str]
        [start, end) character offsets for every occurrence of every token,
        plus the unchanged document (mirrors runSpacyMatcher's interface).
    """
    matches = []
    for token in token_list:
        pattern = re.compile(re.escape(token), re.IGNORECASE)
        matches += [[m.start(), m.start() + len(token)]
                    for m in pattern.finditer(document)]

    return matches, document
|
|
|
def searchAnnotator(matches, document):
    """Render *document* in Streamlit with the matched spans highlighted.

    Bug fix vs. the original: the text following the FINAL match was never
    appended, so the tail of every document was silently dropped from the
    rendered output. Also builds the output with a list + join instead of
    repeated string concatenation.

    Parameters
    ----------
    matches : list[list[int]]
        [start, end) offsets into *document*; assumed sorted ascending and
        non-overlapping, as produced by runRegexMatcher — NOTE(review):
        runSpacyMatcher returns token offsets over a spaCy Doc, so slices
        here would be Spans rather than str; confirm the intended caller.
    document : str
        The text the offsets index into.
    """
    parts = []
    cursor = 0
    for start_idx, end_idx in matches:
        # Plain text up to the match, then the highlighted match itself.
        parts.append(document[cursor:start_idx])
        parts.append(str(annotation(body=document[start_idx:end_idx],
                                    label="ANSWER",
                                    background="#964448",
                                    color='#ffffff')))
        cursor = end_idx

    # Fix: include everything after the last match.
    parts.append(document[cursor:])
    annotated_text = "".join(parts)

    st.write(
        markdown(annotated_text),
        unsafe_allow_html=True,
    )
|
|
|
def lexical_search(query, documents):
    """TF-IDF lexical search over *documents*, rendering hits in Streamlit.

    Builds an in-memory document store, retrieves the top-k documents for
    *query* (k comes from the 'lexical_search'/'TOP_K' config entry), then
    highlights the query tokens inside each retrieved document.

    Parameters
    ----------
    query : str
        The user's search string.
    documents : list
        Documents in the format accepted by
        InMemoryDocumentStore.write_documents.
    """
    store = InMemoryDocumentStore()
    store.write_documents(documents)
    retriever = TfidfRetriever(store)

    top_k = int(config.get('lexical_search', 'TOP_K'))
    hits = retriever.retrieve(query=query, top_k=top_k)

    query_tokens = tokenize_lexical_query(query)
    for hit in hits:
        token_spans, parsed_doc = runSpacyMatcher(query_tokens, hit.content)
        searchAnnotator(token_spans, parsed_doc)
|
|
|
|
|
|
|
|