File size: 3,189 Bytes
c043657 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
"""
Various utilities needed for the presentation.
"""
from datasets import DatasetDict
from transformers import PreTrainedTokenizerBase
from typing import Callable, Dict, List
def tokenize(tokenizer: PreTrainedTokenizerBase,
             end_char: str = '\n') -> Callable[[Dict[str, List[str]]], DatasetDict]:
    """
    Build a batch-tokenization callback for `datasets.DatasetDict.map`.

    The returned function appends `end_char` to every entry found under the
    'text' key of the batch and then passes the resulting list of strings to
    `tokenizer`. Re-adding the terminator matters when the line structure has
    to survive tokenization (e.g. a poem, where every verse must end with a
    newline), because `datasets.load_dataset` strips the newline characters.

    Parameters
    ----------
    tokenizer : PreTrainedTokenizerBase
        The tokenizer that is called on the list of terminated lines.
    end_char : str
        The terminator appended to each line (a newline by default).

    Returns
    -------
    Callable[[Dict[str, List[str]]], DatasetDict]
        A function that takes a batch with a 'text' key and returns whatever
        `tokenizer` returns for the terminated lines.
    """
    def _tokenize(examples: Dict[str, List[str]]) -> DatasetDict:
        # Rebuild every line with its terminator before tokenizing the batch.
        terminated_lines = []
        for line in examples['text']:
            terminated_lines.append(f'{line}{end_char}')
        return tokenizer(terminated_lines)

    return _tokenize
def group_texts(examples: Dict[str, List[List[int]]],
                block_size: int = 128) -> Dict[str, List[List[int]]]:
    """
    Concatenate a tokenized batch and re-split it in chunks of `block_size`.

    Meant to be used with `DatasetDict.map` on a dataset that was tokenized
    with the function returned by `tokenize`. The code was adapted from
    https://github.com/huggingface/notebooks/blob/main/examples/language_modeling.ipynb

    Parameters
    ----------
    examples : Dict[str, List[List[int]]]
        The batch given by the `DatasetDict.map` method (the original author
        notes it is actually a `LazyBatch`). Every key maps to a list of
        integer sequences, one per example. The 'input_ids' key is required
        because its concatenated length determines how many chunks are
        produced; 'attention_mask' is the other key normally present.
    block_size : int
        The size of each chunk. If the total length of the concatenated
        'input_ids' is not divisible by `block_size`, the remaining tokens
        are dropped for simplicity.

    Returns
    -------
    Dict[str, List[List[int]]]
        A dictionary with the same keys as `examples`, each holding the
        concatenated data split in chunks of `block_size`, plus a 'labels'
        key holding an independent copy of the 'input_ids' chunks.

    Raises
    ------
    ValueError
        If `block_size` is not a positive integer.
    KeyError
        If `examples` has no 'input_ids' key.
    """
    if block_size <= 0:
        # Zero would divide by zero below and a negative size would silently
        # produce empty columns, so fail loudly with a clear message instead.
        raise ValueError(
            f"block_size must be a positive integer, got {block_size}")

    # Concatenate all texts. A single flattening pass is linear, whereas
    # `sum(list_of_lists, [])` re-copies the accumulated list at every step.
    concatenated_examples = {
        k: [token for sequence in examples[k] for token in sequence]
        for k in examples.keys()
    }
    total_length = len(concatenated_examples['input_ids'])

    # We drop the small remainder, we could add padding if the model supported
    # it instead of this drop, you can customize this part to your needs.
    total_length = (total_length // block_size) * block_size

    # Split by chunks of block_size length.
    result = {
        k: [t[i:i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }

    # Labels to be used by the training phase. Every chunk is copied so that
    # 'labels' does not share its inner lists with 'input_ids'; the
    # Transformers library is in charge of making the shift to the right.
    result["labels"] = [list(chunk) for chunk in result["input_ids"]]
    return result
|