---
inference: false
language: pt
datasets:
- lener_br
license: mit
pipeline_tag: token-classification
---

# DeBERTinha XSmall for NER

## Full Token Classification Example

```python
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch

model_name = "sagui-nlp/debertinha-ptbr-xsmall-lenerbr"
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=13)
tokenizer = AutoTokenizer.from_pretrained(model_name)

input_text = "Acrescento que não há de se falar em violação do artigo 114, § 3º, da Constituição Federal, posto que referido dispositivo revela-se impertinente, tratando da possibilidade de ajuizamento de dissídio coletivo pelo Ministério Público do Trabalho nos casos de greve em atividade essencial."

inputs = tokenizer(input_text, max_length=512, truncation=True, return_tensors="pt")
tokens = inputs.tokens()

with torch.no_grad():
    outputs = model(**inputs).logits
predictions = torch.argmax(outputs, dim=2)

# Group SentencePiece sub-tokens back into words: a token starting with "▁"
# opens a new word, so the previous word is flushed with its predicted label.
entities = []
current_entity = []
current_label = None
for token, prediction in zip(tokens[1:-1], predictions[0].numpy()[1:-1]):
    # print((token, model.config.id2label[prediction]))
    if not len(current_entity):
        current_entity.append(token)
        current_label = model.config.id2label[prediction]
    elif token.startswith("▁"):
        entities.append(("".join(current_entity), current_label))
        current_entity = [token]
        current_label = model.config.id2label[prediction]
    else:
        current_entity.append(token)
entities.append(("".join(current_entity), current_label))

# Keep only the words predicted as part of a named entity.
list(filter(lambda x: x[1] != "O", entities))
```
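If you only need entity spans and don't want to write the grouping loop yourself, the `transformers` token-classification pipeline can aggregate sub-tokens for you. This is a minimal sketch, not part of the original card: local execution is assumed (the card's `inference: false` only disables the hosted widget), and `aggregation_strategy="first"` is chosen to mirror the first-sub-token labeling used in training.

```python
from transformers import pipeline

# Minimal sketch (not from the original card): let the pipeline group
# sub-tokens into entity spans instead of the manual "▁"-based loop above.
ner = pipeline(
    "token-classification",
    model="sagui-nlp/debertinha-ptbr-xsmall-lenerbr",
    aggregation_strategy="first",  # label each word by its first sub-token
)

text = "Acrescento que não há de se falar em violação do artigo 114, § 3º, da Constituição Federal."
for entity in ner(text):
    # Scores come back as numpy floats, hence the explicit cast.
    print(entity["entity_group"], entity["word"], float(entity["score"]))
```

The pipeline's word reconstruction may differ slightly from the manual "▁"-based grouping above, but the predicted labels come from the same model.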
## Training notes

Training used the label of only the first sub-token of each word (`label_all_tokens = False`); special tokens and the remaining sub-tokens are set to `-100` so they are ignored by the loss.

```python
label_all_tokens = False
task = "ner"

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True, max_length=512)

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

dataset = dataset.map(tokenize_and_align_labels, batched=True)
```
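Because special tokens and non-first sub-tokens carry the label `-100`, evaluation has to filter those positions out before scoring. The snippet below is an illustrative sketch of such a `compute_metrics` function using `seqeval` through the `evaluate` library; it is not taken from the original training script, and deriving `label_list` from `model.config.id2label` is an assumption.

```python
import numpy as np
import evaluate

# Illustrative sketch: the use of evaluate/seqeval and the label_list source
# are assumptions, not taken from the original training script.
metric = evaluate.load("seqeval")
label_list = list(model.config.id2label.values())

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Drop positions labeled -100 (special tokens and non-first sub-tokens).
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
```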

## Citation

```
@misc{campiotti2023debertinha,
      title={DeBERTinha: A Multistep Approach to Adapt DebertaV3 XSmall for Brazilian Portuguese Natural Language Processing Task},
      author={Israel Campiotti and Matheus Rodrigues and Yuri Albuquerque and Rafael Azevedo and Alyson Andrade},
      year={2023},
      eprint={2309.16844},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}
```