import argparse
import html
import time

from extend import spacy_component  # this is needed to register the spacy component

import spacy
import streamlit as st
from annotated_text import annotation
from classy.utils.streamlit import get_md_200_random_color_generator
def main(
    model_checkpoint_path: str,
    inventory_path: str,
    cuda_device: int,
):

    # setup examples
    examples = [
        "Japan began the defence of their title with a lucky 2-1 win against Syria in a championship match on Friday.",
        "The project was coded in Java.",
        "Rome is in Italy",
    ]

    # define load_resources
    @st.cache(allow_output_mutation=True)
    def load_resources(inventory_path):

        # load nlp
        nlp = spacy.load("en_core_web_sm")
        extend_config = dict(
            checkpoint_path=model_checkpoint_path,
            mentions_inventory_path=inventory_path,
            device=cuda_device,
            tokens_per_batch=10_000,
        )
        nlp.add_pipe("extend", after="ner", config=extend_config)

        # mock call to load resources
        nlp(examples[0])

        # return
        return nlp

    # preload default resources
    load_resources(inventory_path)
    # css rules
    st.write(
        """
        """,
        unsafe_allow_html=True,
    )

    # setup header (simple <h1>; styling omitted)
    st.markdown(
        "<h1>ExtEnD: Extractive Entity Disambiguation</h1>",
        unsafe_allow_html=True,
    )

    st.write(
        """
        """,
        unsafe_allow_html=True,
    )
    # how it works
    def hiw():
        st.markdown(
            """
## How it works

ExtEnD frames Entity Disambiguation as a text extraction problem:
            """
        )
        st.image(
            "data/repo-assets/extend_formulation.png", caption="ExtEnD Formulation"
        )
        st.markdown(
            """
Given the sentence *After a long fight Superman saved Metropolis*, where *Superman* is the mention
to disambiguate, ExtEnD first concatenates the descriptions of all the possible candidates of *Superman* in the
inventory and then selects the span whose description best suits the mention in its context.

To use ExtEnD for full end-to-end entity linking, as we do in the *Demo* below, we just need a mention
identifier. Here [we use spaCy](https://github.com/SapienzaNLP/extend#spacy) (more specifically, its NER) and run ExtEnD on each named
entity spaCy identifies (if the corresponding mention is contained in the inventory).

##### Links:
* [Full Paper](https://www.researchgate.net/publication/359392427_ExtEnD_Extractive_Entity_Disambiguation)
* [GitHub](https://github.com/SapienzaNLP/extend)
            """
        )
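        # For reference, a minimal standalone sketch of the spaCy usage described
        # above (same "extend" component and `._.disambiguated_entity` extension
        # used in demo() below; paths are placeholders):
        #
        #   import spacy
        #   from extend import spacy_component
        #
        #   nlp = spacy.load("en_core_web_sm")
        #   nlp.add_pipe(
        #       "extend",
        #       after="ner",
        #       config=dict(
        #           checkpoint_path="<checkpoint-path>",
        #           mentions_inventory_path="<inventory-path>",
        #           device=-1,
        #           tokens_per_batch=10_000,
        #       ),
        #   )
        #   doc = nlp("After a long fight Superman saved Metropolis")
        #   for ent in doc.ents:
        #       if ent._.disambiguated_entity is not None:
        #           print(ent.text, "->", ent._.disambiguated_entity)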
    # demo
    def demo():
        st.markdown("## Demo")

        # read input
        placeholder = st.selectbox(
            "Examples",
            options=examples,
            index=0,
        )
        input_text = st.text_area("Input text to entity-disambiguate", placeholder)

        # button
        should_disambiguate = st.button("Disambiguate", key="classify")

        # load model and color generator
        nlp = load_resources(inventory_path)
        color_generator = get_md_200_random_color_generator()

        if should_disambiguate:

            # tag sentence
            time_start = time.perf_counter()
            doc = nlp(input_text)
            time_end = time.perf_counter()

            # extract disambiguated entities: start_char -> (start_char, end_char, text, entity)
            entities = {}
            for ent in doc.ents:
                if ent._.disambiguated_entity is not None:
                    entities[ent.start_char] = (
                        ent.start_char,
                        ent.end_char,
                        ent.text,
                        ent._.disambiguated_entity,
                    )
            # create annotated html components
            annotated_html_components = []
            # every disambiguated entity must start at a token boundary
            assert all(any(t.idx == _s for t in doc) for _s in entities)
            it = iter(list(doc))
            while True:
                try:
                    t = next(it)
                except StopIteration:
                    break
                if t.idx in entities:
                    _start, _end, _text, _entity = entities[t.idx]
                    # consume all tokens covered by the entity, then emit a single annotation
                    while t.idx + len(t) != _end:
                        t = next(it)
                    annotated_html_components.append(
                        str(annotation(_text, _entity, color_generator()))
                    )
                else:
                    annotated_html_components.append(str(html.escape(t.text)))
            # render the annotated components plus the elapsed time
            st.markdown(
                "\n".join(
                    [
                        "<div>",
                        *annotated_html_components,
                        "</div>",
                        f"<p>Time: {(time_end - time_start):.2f}s</p>",
                    ]
                ),
                unsafe_allow_html=True,
            )
    demo()
    hiw()


if __name__ == "__main__":
    main(
        "experiments/extend-longformer-large/2021-10-22/09-11-39/checkpoints/best.ckpt",
        "data/inventories/le-and-titov-2018-inventory.min-count-2.sqlite3",
        cuda_device=-1,
    )
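# A possible CLI wiring for the (currently unused) argparse import -- an
# illustrative sketch only, with assumed argument names; note that when
# launching via `streamlit run`, script arguments go after a `--` separator:
#
#   def parse_args():
#       parser = argparse.ArgumentParser()
#       parser.add_argument("model_checkpoint_path")
#       parser.add_argument("inventory_path")
#       parser.add_argument("--cuda-device", type=int, default=-1)
#       return parser.parse_args()
#
#   args = parse_args()
#   main(args.model_checkpoint_path, args.inventory_path, cuda_device=args.cuda_device)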