In [1]:
from transformers import AutoModelForTokenClassification
from transformers import AutoTokenizer

from datasets import load_dataset
from pprint import pprint
from collections import Counter
import random
import evaluate
import numpy as np

import os
from huggingface_hub import login
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification

In [3]:
# Define the checkpoint and expose the Hugging Face token (used later to log in and push the model to the Hub)
checkpoint = "bert-base-cased"

# Read the token inside a context manager so the file handle is closed again,
# and strip surrounding whitespace so a trailing newline does not end up in the token.
with open("/home/hf/hf-course/chapter7/hf-token.txt", "r") as token_file:
    os.environ["HF_TOKEN"] = token_file.readlines()[0].strip()

In [4]:
# Load the "louisguitton/dev-ner-ontonotes" dataset with datasets.load_dataset
dataset = load_dataset("louisguitton/dev-ner-ontonotes")
# Trailing expression: display the resulting DatasetDict (splits, columns, row counts)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'entities', 'entities-suggestion', 'entities-suggestion-metadata', 'external_id', 'metadata'],
        num_rows: 8528
    })
    validation: Dataset({
        features: ['text', 'entities', 'entities-suggestion', 'entities-suggestion-metadata', 'external_id', 'metadata'],
        num_rows: 8528
    })
})

In [5]:
# Have a look at one sample example in the dataset.
# shuffle() is called without a seed, so a different example may be printed on every run.
pprint(dataset["train"].shuffle().take(1)[0])

{'entities': [],
 'entities-suggestion': {'end': [30],
                         'label': ['PERSON'],
                         'score': [1.0],
                         'start': [23],
                         'text': ['Camilla']},
 'entities-suggestion-metadata': {'agent': 'gold_labels',
                                  'score': None,
                                  'type': None},
 'external_id': None,
 'metadata': '{}',
 'text': 'The horse is basically Camilla /.'}


In [6]:
# Collect every annotated entity type of the training split (one item per annotated span)
entity_types = [
    label
    for element in dataset["train"]
    for label in element["entities-suggestion"]["label"]
]

# Tag set: "O" first, then a B-/I- pair for each entity type in alphabetical order
entities = sorted(set(entity_types))
final_entities = ["O"] + [
    f"{prefix}-{entity}" for entity in entities for prefix in ("B", "I")
]
print(final_entities)

# Frequency of each entity type in the training split
print(Counter(entity_types))

['O', 'B-CARDINAL', 'I-CARDINAL', 'B-DATE', 'I-DATE', 'B-EVENT', 'I-EVENT', 'B-FAC', 'I-FAC', 'B-GPE', 'I-GPE', 'B-LANGUAGE', 'I-LANGUAGE', 'B-LAW', 'I-LAW', 'B-LOC', 'I-LOC', 'B-MONEY', 'I-MONEY', 'B-NORP', 'I-NORP', 'B-ORDINAL', 'I-ORDINAL', 'B-ORG', 'I-ORG', 'B-PERCENT', 'I-PERCENT', 'B-PERSON', 'I-PERSON', 'B-PRODUCT', 'I-PRODUCT', 'B-QUANTITY', 'I-QUANTITY', 'B-TIME', 'I-TIME', 'B-WORK_OF_ART', 'I-WORK_OF_ART']
Counter({'GPE': 2268, 'PERSON': 2020, 'ORG': 1740, 'DATE': 1507, 'CARDINAL': 938, 'NORP': 847, 'MONEY': 274, 'ORDINAL': 232, 'TIME': 214, 'LOC': 204, 'PERCENT': 177, 'EVENT': 143, 'WORK_OF_ART': 142, 'FAC': 115, 'QUANTITY': 100, 'PRODUCT': 72, 'LAW': 40, 'LANGUAGE': 33})


In [7]:
# Lookup tables between tag strings and integer ids; ids follow the order of final_entities
id2label = dict(enumerate(final_entities))
label2id = {label: idx for idx, label in id2label.items()}

In [8]:
# Create the tokenizer that belongs to the pretrained checkpoint ("bert-base-cased")
tokenizer = AutoTokenizer.from_pretrained(checkpoint)



In [9]:
# Print the tokenizer's repr to inspect its configuration (class, vocabulary size, special tokens, ...)
pprint(tokenizer)

BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


In [10]:
# Tokenize the first training example and check what is returned;
# return_offsets_mapping=True adds an "offset_mapping" entry to the usual tokenizer fields
output = tokenizer(dataset["train"][0]["text"], return_offsets_mapping=True)

In [11]:
# List the fields returned by the tokenizer call above
output.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping'])

In [12]:
# Have a look at the entities annotated for the first training example.
# Index the row first (as done for the text above) so only one record is read
# instead of materialising the whole "entities-suggestion" column.
dataset["train"][0]["entities-suggestion"]

{'start': [2, 40, 53, 108, 122],
 'end': [9, 45, 56, 113, 137],
 'label': ['NORP', 'CARDINAL', 'CARDINAL', 'PRODUCT', 'LOC'],
 'text': ['Russian', 'three', '118', 'Kursk', 'the Barents Sea'],
 'score': [1.0, 1.0, 1.0, 1.0, 1.0]}

In [13]:
def in_span(source_start, source_end, target_start, target_end):
    """
    Tell whether the target span lies entirely inside the source span.

    Both bounds are compared inclusively: a target that shares its start and/or
    end with the source still counts as contained.
    """
    starts_inside = source_start <= target_start
    ends_inside = target_end <= source_end
    return bool(starts_inside and ends_inside)


def _align_entity_labels(offsets, entity_starts, entity_ends, entity_types):
    """
    Convert character-level entity spans into one BIO label id per token.

    Args:
        offsets: (start, end) character offsets of the tokens, special tokens excluded.
        entity_starts: start character index of each annotated entity.
        entity_ends: end character index of each annotated entity (treated as exclusive).
        entity_types: type of each annotated entity, e.g. "PERSON".

    Returns:
        A list holding one `label2id` value per entry of `offsets`.

    NOTE(review): assumes the entities are ordered by position and do not overlap — confirm
    against the dataset.
    """
    num_entities = len(entity_types)
    labels = []

    # Index of the entity the current token may belong to
    idx = 0
    # Index of the entity that received the most recent B-/I- tag
    last_tagged_idx = None

    for start, end in offsets:
        # Skip every entity that ends at or before the start of this token. The end offset is
        # exclusive, so a token starting exactly at an entity's end is already outside of it.
        while idx < num_entities and start >= entity_ends[idx]:
            idx += 1

        lab = "O"
        if idx < num_entities and in_span(entity_starts[idx], entity_ends[idx], start, end):
            # The first token of an entity gets B-, the following ones I-. Deciding on the entity
            # index (not on the previous tag) keeps two back-to-back entities separate.
            prefix = "I-" if last_tagged_idx == idx else "B-"
            lab = prefix + entity_types[idx]
            last_tagged_idx = idx

        labels.append(label2id[lab])

    return labels


def tokenize_and_create_labels(example):
    """
    Tokenize a batch of examples and create token-level labels for it.

    The annotations are character spans on the raw text, so they are not aligned with the
    tokens produced by wordpiece tokenization; this function performs that alignment.

    Args:
        example: a batch with a "text" list and an "entities-suggestion" list whose items
            provide "start", "end" and "label" lists.

    Returns:
        The tokenizer output (including "offset_mapping") extended with a "labels" entry:
        one list of label ids per sample, with -100 at the first and last position.
    """
    outputs = tokenizer(
        example["text"], truncation=True, return_offsets_mapping=True)

    output_labels = []

    # Do for all the samples in the batch
    for offsets, annotation in zip(outputs["offset_mapping"], example["entities-suggestion"]):
        # Do not take the first and last offsets as they belong to a special token (CLS and SEP respectively)
        labels = _align_entity_labels(
            offsets[1:-1],
            annotation["start"],
            annotation["end"],
            annotation["label"],
        )
        # The first and last tokens are reserved for [CLS] and [SEP]; mark them with -100
        output_labels.append([-100] + labels + [-100])
    outputs["labels"] = output_labels

    return outputs

In [14]:
# Tokenize and label every split in batches; the original columns are dropped so that only
# the fields returned by tokenize_and_create_labels remain
tokenized_dataset = dataset.map(tokenize_and_create_labels, batched=True,
                                remove_columns=dataset["train"].column_names)

In [15]:
# Create a sample of 5 items for the sake of visualization.
# The shuffle is seeded, and the original columns are kept (no remove_columns) so the
# annotations can be printed next to the generated labels.
samples = dataset["train"].shuffle(seed=43).take(5).map(
    tokenize_and_create_labels, batched=True)

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [16]:
# Visualize one randomly chosen sample: tokens on the first row, their labels underneath.
# randrange excludes the upper bound, whereas randint(0, len(samples)) could return
# len(samples) and make the lookups below fail with an IndexError.
idx = random.randrange(len(samples))

ip_tokens = [tokenizer.decode([x]) for x in samples[idx]["input_ids"]]
labels = samples[idx]["labels"]

token_op, lbl_op = "", ""
for token, lbl in zip(ip_tokens, labels):
    # -100 has no entry in id2label, so those positions are shown as SPECIAL
    lbl = id2label.get(lbl, "SPECIAL")
    # Pad token and label to a common width so the two rows stay aligned
    width = max(len(token), len(lbl)) + 2
    token_op += f"{token:<{width}}"
    lbl_op += f"{lbl:<{width}}"

print(token_op)
print(lbl_op)
print(f"Number of tokens: {len(ip_tokens)}, Number of Labels:  {len(labels)}")
print("Entities Annotated: ", samples[idx]["entities-suggestion"])

[CLS]    An  easy  but  rare  maneuver  with  extraordinary  consequences  /  .  [SEP]    
SPECIAL  O   O     O    O     O         O     O              O             O  O  SPECIAL  
Number of tokens: 12, Number of Labels:  12
Entities Annotated:  {'start': [], 'end': [], 'label': [], 'text': [], 'score': []}


In [17]:
# We need to remove the offset mappings, as it would not be possible to collate the data without dropping this column
tokenized_dataset = tokenized_dataset.remove_columns(
    column_names=["offset_mapping"])

In [18]:
# Create a data collator to apply padding as and when necessary
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
# Try it on the first two training examples to see how it works
batch = data_collator([tokenized_dataset["train"][i] for i in range(2)])
# Display the collated label tensor of this small batch
batch["labels"]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    0,   19,    0,    0,    0,    0,    0,    0,    1,    0,    0,
            1,    0,    0,    0,    0,    0,    0,    0,    0,   29,   30,    0,
            0,   15,   16,   16,   16,    0, -100],
        [-100,    0,    0,    0,    0,    0,    0,    0,   19,    0,   19,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0, -100, -100, -100, -100, -100]])

In [20]:
metric = evaluate.load("seqeval")

def compute_metrics(eval_preds):
    """
    Score token-classification predictions with the seqeval metric.

    `eval_preds` unpacks into (logits, labels). Positions whose label is -100 are dropped;
    the remaining ids are turned into tag strings through `id2label` before scoring.

    Returns a dict with the overall precision, recall, f1 and accuracy.
    """
    logits, labels = eval_preds

    # Highest-scoring class for every token
    predictions = np.argmax(logits, axis=-1)

    true_labels, true_predictions = [], []
    for pred_row, gold_row in zip(predictions, labels):
        # Keep only the positions that carry a real label (-100 marks ignored positions)
        kept = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
        true_labels.append([id2label[g] for _, g in kept])
        true_predictions.append([id2label[p] for p, _ in kept])

    all_metrics = metric.compute(
        predictions=true_predictions, references=true_labels)

    # Report the overall figures only, not the per-entity breakdown
    precision = all_metrics["overall_precision"]
    recall = all_metrics["overall_recall"]
    f1 = all_metrics["overall_f1"]
    accuracy = all_metrics["overall_accuracy"]
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy,
    }

In [21]:
# Create a model for token classification on top of pretrained BERT model.
# The label mappings are passed along so the model configuration knows the tag set.
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['cl

In [22]:
# Check the classifier architecture (the classification layer of the model)
model.classifier

Linear(in_features=768, out_features=37, bias=True)

In [23]:
# Sanity check: the number of labels in the model config and the sizes of both
# label mappings should all be the same
model.config.num_labels, len(label2id), len(id2label)

(37, 37, 37)

In [24]:
# Login to huggingface for uploading the generated model;
# the token is taken from the HF_TOKEN environment variable set earlier in the notebook
login(token=os.environ.get("HF_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/.cache/huggingface/token
Login successful


In [27]:
# Training configuration for the fine-tuning run
args = TrainingArguments(
    # First positional argument: name of the output directory
    "dev-ner-ontonote-bert-finetuned",
    # Evaluate and save once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    # Push the results to the Hugging Face Hub
    push_to_hub=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32
)

In [28]:
# Display the tokenized DatasetDict to check its columns before training
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8528
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8528
    })
})

In [29]:
# Assemble the Trainer from the model, the training arguments, the collator,
# both tokenized splits and the metric function
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

# Run the fine-tuning
trainer.train()

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
/home/hf/hf-course/chapter7/dev-ner-ontonote-bert-finetuned is already a clone of https://huggingface.co/ElisonSherton/dev-ner-ontonote-bert-finetuned. Make sure you pull the latest changes with `repo.git_pull()`.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.111329,0.757552,0.797257,0.776898,0.968852
2,0.281100,0.055888,0.873178,0.908711,0.89059,0.984724
3,0.281100,0.035979,0.914701,0.94777,0.930942,0.990416
4,0.063000,0.027458,0.933327,0.960033,0.946492,0.992793
5,0.063000,0.024083,0.940449,0.966845,0.953464,0.993742


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=1335, training_loss=0.1388676861252231, metrics={'train_runtime': 562.8544, 'train_samples_per_second': 75.757, 'train_steps_per_second': 2.372, 'total_flos': 1425922860395136.0, 'train_loss': 0.1388676861252231, 'epoch': 5.0})

In [30]:
# Upload the fine-tuned model to the Hugging Face Hub.
# The commit message starts with the hugging-face emoji; it was stored as mojibake
# (its UTF-8 bytes decoded with a single-byte codec) and is restored here.
trainer.push_to_hub(
    commit_message="🤗 Training of first BERT based NER task completed!!")

To https://huggingface.co/ElisonSherton/dev-ner-ontonote-bert-finetuned
   41c8386..27067f9  main -> main

