bakrianoo committed on
Commit
2bb1643
1 Parent(s): dde5302

fix [grouped_entities] output

Browse files
Files changed (1) hide show
  1. README.md +29 -1
README.md CHANGED
@@ -33,6 +33,7 @@ Install transformers AND nltk (python >= 3.6)
33
 
34
  ```python
35
  # we need to install NLTK punkt to be used for word tokenization
 
36
  import nltk
37
  nltk.download('punkt')
38
  from nltk.tokenize import word_tokenize
@@ -55,8 +56,35 @@ example = " ".join(word_tokenize(example))
55
  # feed to the NER model to parse
56
  ner_results = nlp(example)
57
 
58
- # ===== print the ner_results
 
 
 
 
59
  for ent in ner_results:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  print(ent["word"], '->' ,ent['entity_group'], " # score:", "%.2f" % ent['score'])
61
 
62
  #####
 
33
 
34
  ```python
35
  # we need to install NLTK punkt to be used for word tokenization
36
+ from collections import defaultdict
37
  import nltk
38
  nltk.download('punkt')
39
  from nltk.tokenize import word_tokenize
 
56
  # feed to the NER model to parse
57
  ner_results = nlp(example)
58
 
59
+ # the [grouped_entities] parameter does not perform well for Arabic,
60
+ # so we provide a simple fix that merges contiguous pieces of the same entity group into full entities
61
+
62
+ grouped_ner_results = defaultdict(list)
63
+ fixed_ner_results = []
64
  for ent in ner_results:
65
+ grouped_ner_results[ent['entity_group']].append(ent)
66
+
67
+
68
+ for group, ents in grouped_ner_results.items():
69
+ if len(ents) == 1:
70
+ fixed_ner_results.append(ents[0])
71
+ continue
72
+
73
+ last_ent, last_start, last_end = ents[0]['word'], ents[0]['start'], ents[0]['end']
74
+ current_ent = {"word": ents[0]['word'], "start": ents[0]['start'], "end": ents[0]['end'], "entity_group": group, "score": ents[0]['score']}
75
+ for i in range(1, len(ents)):
76
+ if ents[i]['start'] == current_ent["end"]:
77
+ current_ent["word"] += ents[i]['word']
78
+ current_ent["end"] = ents[i]['end']
79
+ current_ent["score"] = max(ents[i]['score'], current_ent["score"])
80
+ else:
81
+ fixed_ner_results.append(current_ent)
82
+ current_ent = {"word": ents[i]['word'], "start": ents[i]['start'], "end": ents[i]['end'], "entity_group": group, "score": ents[i]['score']}
83
+
84
+ fixed_ner_results.append(current_ent)
85
+
86
+ # ===== print the fixed (merged) ner_results
87
+ for ent in fixed_ner_results:
88
  print(ent["word"], '->' ,ent['entity_group'], " # score:", "%.2f" % ent['score'])
89
 
90
  #####