fix [grouped_entities] output
Browse files
README.md
CHANGED
@@ -33,6 +33,7 @@ Install transformers AND nltk (python >= 3.6)
|
|
33 |
|
34 |
```python
|
35 |
# we need to install NLTK punkt to be used for word tokenization
|
|
|
36 |
import nltk
|
37 |
nltk.download('punkt')
|
38 |
from nltk.tokenize import word_tokenize
|
@@ -55,8 +56,35 @@ example = " ".join(word_tokenize(example))
|
|
55 |
# feed to the NER model to parse
|
56 |
ner_results = nlp(example)
|
57 |
|
58 |
-
#
|
|
|
|
|
|
|
|
|
59 |
for ent in ner_results:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
print(ent["word"], '->' ,ent['entity_group'], " # score:", "%.2f" % ent['score'])
|
61 |
|
62 |
#####
|
|
|
33 |
|
34 |
```python
|
35 |
# we need to install NLTK punkt to be used for word tokenization
|
36 |
+
from collections import defaultdict
|
37 |
import nltk
|
38 |
nltk.download('punkt')
|
39 |
from nltk.tokenize import word_tokenize
|
|
|
56 |
# feed to the NER model to parse
|
57 |
ner_results = nlp(example)
|
58 |
|
59 |
+
# as the [grouped_entities] parameter does not perform well for Arabic,
|
60 |
+
# we provide a simple fix that merges the pieces back into full entities
|
61 |
+
|
62 |
+
# Bucket the predictions by entity label, keeping the order in which the
# pipeline returned them inside each bucket.
grouped_ner_results = defaultdict(list)
fixed_ner_results = []
for ent in ner_results:
    grouped_ner_results[ent['entity_group']].append(ent)

for group, ents in grouped_ner_results.items():
    # A label that was predicted only once has nothing to merge with,
    # so the original prediction is kept untouched.
    if len(ents) == 1:
        fixed_ner_results.append(ents[0])
        continue

    first, *rest = ents
    current_ent = {
        "word": first['word'],
        "start": first['start'],
        "end": first['end'],
        "entity_group": group,
        "score": first['score'],
    }
    for piece in rest:
        if piece['start'] == current_ent["end"]:
            # This piece starts exactly where the current entity ends:
            # glue it on (no separator), extend the span, and keep the
            # highest score seen among the merged pieces.
            current_ent["word"] += piece['word']
            current_ent["end"] = piece['end']
            current_ent["score"] = max(piece['score'], current_ent["score"])
        else:
            # There is a gap, so the current entity is complete; emit it
            # and start a new one from this piece.
            fixed_ner_results.append(current_ent)
            current_ent = {
                "word": piece['word'],
                "start": piece['start'],
                "end": piece['end'],
                "entity_group": group,
                "score": piece['score'],
            }

    # Emit the entity that was still being built when the bucket ran out.
    fixed_ner_results.append(current_ent)

# ===== print the ner_results
for ent in fixed_ner_results:
    print(ent["word"], '->', ent['entity_group'], " # score:", f"{ent['score']:.2f}")
|
89 |
|
90 |
#####
|