keyword-extraction-viet / model /named_entities.py
Thao Pham
Adding app.py and pipeline.py, changed code structure
664c81e
raw
history blame
1.36 kB
from underthesea import sent_tokenize
def substring(w, ls):
    """Return True if *w* occurs inside some other, different element of *ls*.

    An element equal to *w* itself does not count, so a string is never
    reported as a substring of its own copy in the list.
    """
    return any(w != other and w in other for other in ls)
def get_ner_phrases(sent_ner_result):
    """Merge runs of index-adjacent NER tokens into (phrase, entity) tuples.

    Args:
        sent_ner_result: Sequence of dicts, each providing the keys
            ``"word"``, ``"index"`` and ``"entity"``.
            NOTE(review): presumably the raw output of a token-classification
            pipeline for one sentence -- confirm against the caller.

    Returns:
        A list of ``(phrase, entity)`` tuples in input order. ``phrase`` is
        the space-joined ``"word"`` values of a run of tokens whose
        ``"index"`` values increase by exactly one; ``entity`` is the
        ``"entity"`` value of the last token in that run. An empty input
        yields an empty list.
    """
    if not sent_ner_result:
        return []

    ner_list = []
    current_ner = []
    previous = None
    for token in sent_ner_result:
        # A gap in the token indices closes the phrase collected so far.
        # Adjacency is always measured against the immediately preceding
        # token, so runs of any length stay together.
        if previous is not None and token["index"] != previous["index"] + 1:
            ner_list.append((' '.join(current_ner), previous['entity']))
            current_ner = []
        current_ner.append(token["word"])
        previous = token

    # Flush the final run, which has no following gap to trigger the append.
    ner_list.append((' '.join(current_ner), previous['entity']))
    return ner_list
def get_named_entities(nlp, doc):
    """Collect the distinct named-entity phrases found in *doc*.

    The document is split into sentences with ``sent_tokenize``; each
    sentence is passed to *nlp* and non-empty results are grouped into
    phrases by ``get_ner_phrases``.

    Args:
        nlp: Callable applied to each sentence string. Its result must
            support ``len()`` and be accepted by ``get_ner_phrases``.
            NOTE(review): presumably a token-classification pipeline --
            confirm against the caller.
        doc: Text to extract entities from.

    Returns:
        A list of entity strings in first-seen order. Only phrases whose
        entity label starts with ``'I'`` are kept, duplicates are dropped,
        phrases contained in a longer kept phrase are removed, and every
        ``" ##"`` sequence is stripped from the survivors.
    """
    phrases = []
    for sentence in sent_tokenize(doc):
        tagged = nlp(sentence)
        if len(tagged) == 0:
            continue
        phrases.extend(get_ner_phrases(tagged))

    unique_entities = []
    for text, label in phrases:
        if text in unique_entities:
            continue
        if not label.startswith('I'):
            continue
        unique_entities.append(text)

    merged = []
    for candidate in unique_entities:
        # Skip phrases that are fully contained in a longer kept phrase.
        if substring(candidate, unique_entities):
            continue
        # NOTE(review): " ##" looks like a sub-word continuation marker
        # left by the tokenizer -- confirm.
        merged.append(candidate.replace(" ##", ""))
    return merged