from typing import Dict, List

import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizerFast

from utils import ner_labels_to_ids, intent_labels_to_ids, structure_data


class tokenized_dataset(Dataset):
    """
    A PyTorch Dataset that tokenizes and encodes text data for a BERT-based model.

    Args:
        dataset (dict): A dictionary containing parallel 'text', 'intent',
            and 'entities' lists.
        tokenizer (BertTokenizerFast): A tokenizer instance for processing text input.
        max_len (int, optional): Maximum length of tokenized sequences (default: 128).

    Attributes:
        len (int): Number of samples in the dataset.

    Methods:
        __getitem__(self, index: int) -> Dict[str, torch.Tensor]:
            Retrieve and preprocess a single sample, returning a dictionary
            with the tokenized and encoded text plus NER and intent labels.

        __len__(self) -> int:
            Get the total number of samples in the dataset.

    A usage sketch appears at the bottom of this module.
    """

    def __init__(self, dataset: Dict[str, List[str]], tokenizer: BertTokenizerFast, max_len: int = 128):
        self.len = len(dataset['text'])
        self.ner_labels_to_ids = ner_labels_to_ids()
        self.intent_labels_to_ids = intent_labels_to_ids()
        self.text = dataset['text']
        self.intent = dataset['intent']
        self.ner = dataset['entities']
        self.tokenizer = tokenizer  # already an instantiated tokenizer, so store it as-is
        self.max_len = max_len

    def __getitem__(self, index: int) -> Dict[str, torch.Tensor]:
        sentence = self.text[index].strip()
        intent_label = self.intent[index].strip()
        ner_labels = self.ner[index]

        # Tokenize with character offsets so the word-level NER tags can be
        # aligned with subword tokens below.
        encoding = self.tokenizer(
            sentence,
            return_offsets_mapping=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_len
        )
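
        # `return_offsets_mapping=True` adds an 'offset_mapping' entry to the
        # encoding: one (char_start, char_end) pair per token, with (0, 0)
        # for special tokens ([CLS], [SEP]) and padding. The alignment loop
        # below relies on these offsets to find word boundaries.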

        # Map each word-level NER tag to its integer id.
        tokenized_ner_labels = [self.ner_labels_to_ids[label] for label in ner_labels]

        # Start every token position at -100 so that special tokens, padding,
        # and word-continuation subwords are ignored by the loss.
        encoded_ner_labels = np.ones(len(encoding['offset_mapping']), dtype=int) * -100

        # Assign each word's label to the first subword of that word only.
        i = 0
        prev = -1
        for idx, mapping in enumerate(encoding['offset_mapping']):
            if mapping[0] == mapping[1] == 0:
                # (0, 0) marks special tokens and padding; leave them at -100.
                continue
            if mapping[0] != prev:
                # A gap before this token's start offset means a new word
                # begins here, so it takes the next word-level label.
                encoded_ner_labels[idx] = tokenized_ner_labels[i]
                prev = mapping[1]
                i += 1
            else:
                # Continuation subword of the current word; keep -100.
                prev = mapping[1]
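
        # Illustrative walk-through of the loop above (offsets are made up
        # for clarity; one NER tag per whitespace-separated word):
        #   token    offsets   check                 result
        #   [CLS]    (0, 0)    special/padding       skipped
        #   turn     (0, 4)    start 0 != prev -1    gets label 0, prev = 4
        #   lights   (5, 11)   start 5 != prev 4     gets label 1, prev = 11
        #   on       (12, 14)  start 12 != prev 11   gets label 2, prev = 14
        #   [SEP]    (0, 0)    special/padding       skipped
        # A continuation subword starts exactly at `prev` (no gap), so it
        # keeps -100 and is ignored by the loss.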

        tokenized_intent_label = self.intent_labels_to_ids[intent_label]

        # Convert the encoding and both label sets to tensors.
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['ner_labels'] = torch.as_tensor(encoded_ner_labels)
        item['intent_labels'] = torch.as_tensor(tokenized_intent_label)

        return item

    def __len__(self) -> int:
        return self.len
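

# Minimal usage sketch (illustrative, not part of the module's API). It
# assumes `structure_data` from utils returns a dict with parallel 'text',
# 'intent', and 'entities' lists; its call signature, the checkpoint name,
# and the batch size below are assumptions.
if __name__ == "__main__":
    data = structure_data()  # assumed: {'text': [...], 'intent': [...], 'entities': [...]}
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    train_set = tokenized_dataset(data, tokenizer, max_len=128)
    train_loader = DataLoader(train_set, batch_size=16, shuffle=True)

    batch = next(iter(train_loader))
    print(batch['input_ids'].shape)   # expected: torch.Size([16, 128])
    print(batch['ner_labels'].shape)  # expected: torch.Size([16, 128])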