|
from typing import List |
|
from resources import set_start, audit_elapsedtime, entities_list_to_dict |
|
from transformers import BertTokenizer, BertForTokenClassification |
|
import torch |
|
from gliner import GLiNER |
|
|
|
|
|
|
|
|
|
def init_model_ner():
    """Load and return the pretrained multilingual GLiNER model.

    Timing is audited via the project's set_start/audit_elapsedtime helpers.
    """
    print("Initiating NER model...")
    start = set_start()

    ner_model = GLiNER.from_pretrained("urchade/gliner_multi")

    audit_elapsedtime(function="Initiating NER model", start=start)
    return ner_model
|
|
|
def get_entity_results(model: GLiNER, text: str, entities_list: List[str]):
    """Run GLiNER entity prediction over *text* and map each label to a text span.

    Args:
        model: a loaded GLiNER instance (see ``init_model_ner``).
        text: raw input text to scan.
        entities_list: label names to look for.

    Returns:
        Dict keyed by label (seeded via ``entities_list_to_dict``). NOTE: when
        the model predicts several spans for the same label, the last one wins —
        earlier spans are overwritten; confirm this is the intended behavior.
    """
    print("Initiating entity recognition...")
    start = set_start()

    predictions = model.predict_entities(text, entities_list)

    results = entities_list_to_dict(entities_list)
    for prediction in predictions:
        print(prediction["label"], "=>", prediction["text"])
        results[prediction["label"]] = prediction["text"]

    audit_elapsedtime(function="Retreiving entity labels from text", start=start)
    return results
|
|
|
def init_model_ner_v2():
    """Load a BERT token-classification model for NER plus its matching tokenizer.

    Returns:
        Tuple of ``(tokenizer, model)``.
    """
    print("Initiating NER model...")
    start = set_start()

    # BUG FIX: the tokenizer must come from the SAME checkpoint as the model.
    # Previously the lowercasing "bert-base-uncased" tokenizer was paired with
    # this *cased* CoNLL-03 model, so token ids did not match the model's
    # vocabulary and predictions were unreliable.
    checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"
    tokenizer = BertTokenizer.from_pretrained(checkpoint)
    model = BertForTokenClassification.from_pretrained(checkpoint)

    audit_elapsedtime(function="Initiating NER model", start=start)
    return tokenizer, model
|
|
|
def get_entity_results_v2(tokenizer, model, text: str, entities_list: List[str]):
    """Extract entity strings from *text* with a BERT token-classification model.

    Decodes the model's BIO tag predictions back into surface strings and keeps
    only those strings that exactly match an item of *entities_list*.

    Args:
        tokenizer: tokenizer paired with *model* (must be the same checkpoint).
        model: a ``BertForTokenClassification``-compatible model.
        text: raw input text.
        entities_list: entity strings to keep in the result.

    Returns:
        List of matched entity strings.
    """
    print("Initiating entity recognition...")
    start = set_start()

    # Round-trip through encode/decode so the token list lines up 1:1 with the
    # model's input positions (including special tokens such as [CLS]/[SEP]).
    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text)))
    print("tokens:", tokens)

    input_ids = tokenizer.encode(text, return_tensors="pt")
    print("input_ids:", input_ids)

    # Inference only — no gradients needed.
    with torch.no_grad():
        outputs = model(input_ids)
    print("outputs:", outputs)

    # Best label id for each token position.
    predicted_labels = torch.argmax(outputs.logits, dim=2)[0]
    print("predicted_labels:", predicted_labels)

    # Stitch BIO-tagged tokens back into entity strings: 'B-' starts a new
    # span, 'I-' extends the current one, anything else closes it.
    entities = []
    current_entity = ""
    for i, label_id in enumerate(predicted_labels):
        label = model.config.id2label[label_id.item()]
        print(f"i[{i}], label[{label}], label_id[{label_id}]")
        token = tokens[i]
        if label.startswith('B-'):
            print(token)
            if current_entity:
                entities.append(current_entity.strip())
            current_entity = token
        elif label.startswith('I-'):
            print(token)
            current_entity += " " + token
        else:
            if current_entity:
                entities.append(current_entity.strip())
            current_entity = ""
    # BUG FIX: a span still open when the loop ends was previously dropped,
    # losing any entity that runs to the final token.
    if current_entity:
        entities.append(current_entity.strip())

    # NOTE(review): this keeps entity *texts* that literally equal one of the
    # requested strings in entities_list; confirm this string-match filter is
    # intended (it may have been meant to compare predicted label names).
    filtered_entities = [entity for entity in entities if entity in entities_list]
    print("filtered_entities:", filtered_entities)

    audit_elapsedtime(function="Retreiving entity labels from text", start=start)
    return filtered_entities