from haystack.nodes import TfidfRetriever
from haystack.document_stores import InMemoryDocumentStore
import configparser
import spacy
import re
from spacy.matcher import Matcher
import streamlit as st
from markdown import markdown
from annotated_text import annotation
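
# Lexical search helpers: retrieve the top-k passages for a query with a
# Haystack TF-IDF retriever, then highlight the query tokens in each
# passage in Streamlit using annotated_text.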

config = configparser.ConfigParser()
# Read lexical-search parameters (e.g. TOP_K) from the config file,
# closing the file handle once it has been parsed.
with open('paramconfig.py') as config_file:
    config.read_file(config_file)


def tokenize_lexical_query(query):
    """Tokenize the query with spaCy and drop stop words."""
    nlp = spacy.load("en_core_web_sm")
    token_list = [token.text.lower() for token in nlp(query) if not token.is_stop]
    return token_list

def runSpacyMatcher(token_list, document):
    """Find token-level matches of the query tokens in the document.

    Returns a list of [start, end] *token* offsets into the returned
    spaCy Doc, so callers must slice the Doc rather than the raw string.
    """
    nlp = spacy.load("en_core_web_sm")
    spacydoc = nlp(document)
    matcher = Matcher(nlp.vocab)
    # One single-token pattern per query token, matched case-insensitively.
    token_pattern = [[{"LOWER": token}] for token in token_list]
    matcher.add(",".join(token_list), token_pattern)
    spacymatches = matcher(spacydoc)

    # Keep only the (start, end) token offsets of each match.
    matches = [[start, end] for match_id, start, end in spacymatches]

    return matches, spacydoc

def runRegexMatcher(token_list, document):
    """Find character-level matches of the query tokens in the raw text."""
    matches = []
    for token in token_list:
        # re.escape prevents tokens containing regex metacharacters
        # from being interpreted as patterns.
        matches += [[match.start(), match.end()]
                    for match in re.finditer(re.escape(token), document)]

    return matches, document

def searchAnnotator(matches, document):
    """Render the document in Streamlit with matched spans highlighted."""
    start = 0
    annotated_text = ""
    # Process matches in order of appearance so the text is stitched
    # back together correctly.
    for start_idx, end_idx in sorted(matches):
        # str() handles both plain strings (character offsets) and spaCy
        # Docs (token offsets, where slicing yields a Span, not a str).
        annotated_text += str(document[start:start_idx]) + str(
            annotation(body=str(document[start_idx:end_idx]),
                       label="ANSWER", background="#964448", color="#ffffff"))
        start = end_idx
    # Append any text remaining after the last match.
    annotated_text += str(document[start:])

    st.write(
            markdown(annotated_text),
            unsafe_allow_html=True,
        )

def lexical_search(query, documents):
    """Run TF-IDF retrieval over the documents and display the top-k
    results in Streamlit with the query tokens highlighted."""
    document_store = InMemoryDocumentStore()
    document_store.write_documents(documents)
    retriever = TfidfRetriever(document_store)
    results = retriever.retrieve(query=query,
                                 top_k=int(config.get('lexical_search', 'TOP_K')))
    query_tokens = tokenize_lexical_query(query)
    for result in results:
        matches, doc = runSpacyMatcher(query_tokens, result.content)
        searchAnnotator(matches, doc)
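

# --- Usage sketch (illustrative; not part of the original app) ---
# A minimal example of calling lexical_search directly. It assumes that
# paramconfig.py defines a [lexical_search] section with a TOP_K value
# (e.g. TOP_K = 3) and that the en_core_web_sm spaCy model is installed.
# The sample documents and query below are hypothetical.
if __name__ == "__main__":
    from haystack.schema import Document

    sample_docs = [
        Document(content="The policy promotes renewable energy targets."),
        Document(content="Coal subsidies were phased out under the plan."),
    ]
    # Retrieves the top-k passages and renders them with the query
    # tokens highlighted (intended to run inside `streamlit run`).
    lexical_search("renewable energy policy", sample_docs)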