poccio committed on
Commit
2c67aa0
1 Parent(s): d1f912d

initial commit

Browse files
Files changed (1) hide show
  1. app.py +124 -123
app.py CHANGED
@@ -44,143 +44,144 @@ def main(
44
  """
45
  <div align="center">
46
  <a href="https://sunglasses-ai.github.io/classy/">
47
- <img alt="Python" style="height: 3em; margin: 0 1em" src="">
48
  </a>
49
  <a href="https://spacy.io/" style="text-decoration: none">
50
- <img alt="spaCy" style="height: 3em; margin: 0 1em;" src="">
51
  </a>
52
  </div>
53
  """,
54
  unsafe_allow_html=True,
55
  )
56
 
57
- # description
58
- st.write(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  """
60
- Local models for Entity Disambiguation (ED) have today become extremely powerful, in most part thanks to the advent of large pre-trained language models. However, despite their significant performance achievements, most of these approaches frame ED through classification formulations that have intrinsic limitations, both computationally and from a modeling perspective. In contrast with this trend, here we propose ExtEnD, a novel local formulation for ED where we frame this task as a text extraction problem, and present two Transformer-based architectures that implement it. Based on experiments in and out of domain, and training over two different data regimes, we find our approach surpasses all its competitors in terms of both data efficiency and raw performance. ExtEnD outperforms its alternatives by as few as 6 F1 points on the more constrained of the two data regimes and, when moving to the other higher-resourced regime, sets a new state of the art on 4 out of 6 benchmarks under consideration, with average improvements of 0.7 F1 points overall and 1.1 F1 points out of domain. In addition, to gain better insights from our results, we also perform a fine-grained evaluation of our performances on different classes of label frequency, along with an ablation study of our architectural choices and an error analysis. We release our code and models for research purposes at https://github.com/SapienzaNLP/extend.
61
-
62
- Link to full paper: https://www.researchgate.net/publication/359392427_ExtEnD_Extractive_Entity_Disambiguation
63
- Link to GitHub paper: https://github.com/SapienzaNLP/extend
64
- """
65
- )
66
- st.markdown("""
67
- ## How it works
68
-
69
- ExtEnD frames Entity Disambiguation as a text extraction problem:
70
- """)
71
- st.image(
72
- "data/repo-assets/extend_formulation.png", caption="ExtEnD Formulation"
73
- )
74
- st.markdown(
75
- """
76
- Given the sentence *After a long fight Superman saved Metropolis*, where *Superman* is the mention
77
- to disambiguate, ExtEnD first concatenates the descriptions of all the possible candidates of *Superman* in the
78
- inventory and then selects the span whose description best suits the mention in its context.
79
-
80
- To convert this task to end2end entity linking, as we do in *Model demo*, we leverage spaCy
81
- (more specifically, its NER) and run ExtEnD on each named entity spaCy identifies
82
- (if the corresponding mention is contained in the inventory).
83
- """
84
- )
85
 
86
  # demo
87
- st.markdown("## Demo")
88
-
89
- @st.cache(allow_output_mutation=True)
90
- def load_resources(inventory_path):
91
-
92
- # load nlp
93
- nlp = spacy.load("en_core_web_sm")
94
- extend_config = dict(
95
- checkpoint_path=model_checkpoint_path,
96
- mentions_inventory_path=inventory_path,
97
- device=cuda_device,
98
- tokens_per_batch=10_000,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  )
100
- nlp.add_pipe("extend", after="ner", config=extend_config)
101
 
102
- # mock call to load resources
103
- nlp(examples[0])
104
-
105
- # return
106
- return nlp
107
-
108
- # read input
109
- placeholder = st.selectbox(
110
- "Examples",
111
- options=examples,
112
- index=0,
113
- )
114
- input_text = st.text_area("Input text to entity-disambiguate", placeholder)
115
-
116
- # custom inventory
117
- uploaded_inventory_path = st.file_uploader(
118
- "[Optional] Upload custom inventory (tsv file, mention \\t desc1 \\t desc2 \\t)",
119
- accept_multiple_files=False,
120
- type=["tsv"],
121
- )
122
- if uploaded_inventory_path is not None:
123
- inventory_path = f"data/inventories/{uploaded_inventory_path.name}"
124
- with open(inventory_path, "wb") as f:
125
- f.write(uploaded_inventory_path.getbuffer())
126
- else:
127
- inventory_path = default_inventory_path
128
-
129
- # load model and color generator
130
- nlp = load_resources(inventory_path)
131
- color_generator = get_md_200_random_color_generator()
132
-
133
- if st.button("Disambiguate", key="classify"):
134
-
135
- # tag sentence
136
- time_start = time.perf_counter()
137
- doc = nlp(input_text)
138
- time_end = time.perf_counter()
139
-
140
- # extract entities
141
- entities = {}
142
- for ent in doc.ents:
143
- if ent._.disambiguated_entity is not None:
144
- entities[ent.start_char] = (
145
- ent.start_char,
146
- ent.end_char,
147
- ent.text,
148
- ent._.disambiguated_entity,
149
- )
150
-
151
- # create annotated html components
152
-
153
- annotated_html_components = []
154
-
155
- assert all(any(t.idx == _s for t in doc) for _s in entities)
156
- it = iter(list(doc))
157
- while True:
158
- try:
159
- t = next(it)
160
- except StopIteration:
161
- break
162
- if t.idx in entities:
163
- _start, _end, _text, _entity = entities[t.idx]
164
- while t.idx + len(t) != _end:
165
- t = next(it)
166
- annotated_html_components.append(
167
- str(annotation(*(_text, _entity, color_generator())))
168
- )
169
- else:
170
- annotated_html_components.append(str(html.escape(t.text)))
171
-
172
- st.markdown(
173
- "\n".join(
174
- [
175
- "<div>",
176
- *annotated_html_components,
177
- "<p></p>"
178
- f'<div style="text-align: right"><p style="color: gray">Time: {(time_end - time_start):.2f}s</p></div>'
179
- "</div>",
180
- ]
181
- ),
182
- unsafe_allow_html=True,
183
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
 
185
 
186
 
 
44
  """
45
  <div align="center">
46
  <a href="https://sunglasses-ai.github.io/classy/">
47
+ <img alt="Python" style="height: 3em; margin: 0em 1em 2em 1em;" src="">
48
  </a>
49
  <a href="https://spacy.io/" style="text-decoration: none">
50
+ <img alt="spaCy" style="height: 3em; margin: 0em 1em 2em 1em;" src="">
51
  </a>
52
  </div>
53
  """,
54
  unsafe_allow_html=True,
55
  )
56
 
57
+ # how it works
58
+ def hiw():
59
+ st.markdown("""
60
+ ## How it works
61
+
62
+ ExtEnD frames Entity Disambiguation as a text extraction problem:
63
+ """)
64
+ st.image(
65
+ "data/repo-assets/extend_formulation.png", caption="ExtEnD Formulation"
66
+ )
67
+ st.markdown(
68
+ """
69
+ Given the sentence *After a long fight Superman saved Metropolis*, where *Superman* is the mention
70
+ to disambiguate, ExtEnD first concatenates the descriptions of all the possible candidates of *Superman* in the
71
+ inventory and then selects the span whose description best suits the mention in its context.
72
+
73
+ To convert this task to end2end entity linking, as we do in *Model demo*, we leverage spaCy
74
+ (more specifically, its NER) and run ExtEnD on each named entity spaCy identifies
75
+ (if the corresponding mention is contained in the inventory).
76
+
77
+ Links:
78
+ * [full paper](https://www.researchgate.net/publication/359392427_ExtEnD_Extractive_Entity_Disambiguation)
79
+ * [GitHub](https://github.com/SapienzaNLP/extend)
80
  """
81
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
  # demo
84
+ def demo():
85
+ st.markdown("## Demo")
86
+
87
+ @st.cache(allow_output_mutation=True)
88
+ def load_resources(inventory_path):
89
+
90
+ # load nlp
91
+ nlp = spacy.load("en_core_web_sm")
92
+ extend_config = dict(
93
+ checkpoint_path=model_checkpoint_path,
94
+ mentions_inventory_path=inventory_path,
95
+ device=cuda_device,
96
+ tokens_per_batch=10_000,
97
+ )
98
+ nlp.add_pipe("extend", after="ner", config=extend_config)
99
+
100
+ # mock call to load resources
101
+ nlp(examples[0])
102
+
103
+ # return
104
+ return nlp
105
+
106
+ # read input
107
+ placeholder = st.selectbox(
108
+ "Examples",
109
+ options=examples,
110
+ index=0,
111
  )
112
+ input_text = st.text_area("Input text to entity-disambiguate", placeholder)
113
 
114
+ # custom inventory
115
+ uploaded_inventory_path = st.file_uploader(
116
+ "[Optional] Upload custom inventory (tsv file, mention \\t desc1 \\t desc2 \\t)",
117
+ accept_multiple_files=False,
118
+ type=["tsv"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  )
120
+ if uploaded_inventory_path is not None:
121
+ inventory_path = f"data/inventories/{uploaded_inventory_path.name}"
122
+ with open(inventory_path, "wb") as f:
123
+ f.write(uploaded_inventory_path.getbuffer())
124
+ else:
125
+ inventory_path = default_inventory_path
126
+
127
+ # load model and color generator
128
+ nlp = load_resources(inventory_path)
129
+ color_generator = get_md_200_random_color_generator()
130
+
131
+ if st.button("Disambiguate", key="classify"):
132
+
133
+ # tag sentence
134
+ time_start = time.perf_counter()
135
+ doc = nlp(input_text)
136
+ time_end = time.perf_counter()
137
+
138
+ # extract entities
139
+ entities = {}
140
+ for ent in doc.ents:
141
+ if ent._.disambiguated_entity is not None:
142
+ entities[ent.start_char] = (
143
+ ent.start_char,
144
+ ent.end_char,
145
+ ent.text,
146
+ ent._.disambiguated_entity,
147
+ )
148
+
149
+ # create annotated html components
150
+
151
+ annotated_html_components = []
152
+
153
+ assert all(any(t.idx == _s for t in doc) for _s in entities)
154
+ it = iter(list(doc))
155
+ while True:
156
+ try:
157
+ t = next(it)
158
+ except StopIteration:
159
+ break
160
+ if t.idx in entities:
161
+ _start, _end, _text, _entity = entities[t.idx]
162
+ while t.idx + len(t) != _end:
163
+ t = next(it)
164
+ annotated_html_components.append(
165
+ str(annotation(*(_text, _entity, color_generator())))
166
+ )
167
+ else:
168
+ annotated_html_components.append(str(html.escape(t.text)))
169
+
170
+ st.markdown(
171
+ "\n".join(
172
+ [
173
+ "<div>",
174
+ *annotated_html_components,
175
+ "<p></p>"
176
+ f'<div style="text-align: right"><p style="color: gray">Time: {(time_end - time_start):.2f}s</p></div>'
177
+ "</div>",
178
+ ]
179
+ ),
180
+ unsafe_allow_html=True,
181
+ )
182
+
183
+ demo()
184
+ hiw()
185
 
186
 
187