"""
Various utilities needed for the presentation.
"""

from typing import Callable, Dict, List

from transformers import BatchEncoding, PreTrainedTokenizerBase


def tokenize(tokenizer: PreTrainedTokenizerBase,
             end_char: str = '\n'
             ) -> Callable[[Dict[str, List[str]]], BatchEncoding]:
|
""" |
|
Helper function that returns a function to use with the `map` method of |
|
datasets.DatasetDict. It takes a tokenizer and generates a function that |
|
applies that tokenizer with an optional `end_char` parameter (e.g. a |
|
newline) that might be needed (e.g. when trying to tokenize and keep the |
|
structure of a poem wich needs the newline after each sentence). This is |
|
needed since the function `datasets.load_dataset` forcibly removes the |
|
newlines characters. |
|
|
|
Parameters |
|
---------- |
|
tokenizer : PreTrainedTokenizerBase |
|
The tokenizer to use for the tokenization process. |
|
end_char : str |
|
The end character to append to each line. |
|
|
|
Returns |
|
------- |
|
Callable[[Dict[str, List[str]]], DatasetDict] |
|
The function in charge of the tokenization process. |
|
|
|
""" |
|
    def _tokenize(examples: Dict[str, List[str]]) -> BatchEncoding:
        # Re-append the end character that `load_dataset` stripped from
        # each line before tokenizing the whole batch.
        return tokenizer([f'{e}{end_char}' for e in examples['text']])

    return _tokenize
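

# Usage sketch (illustrative only): map the returned closure over a text
# dataset. The data file and checkpoint names below are hypothetical, not
# part of this module.
#
#     from datasets import load_dataset
#     from transformers import AutoTokenizer
#
#     tokenizer = AutoTokenizer.from_pretrained('gpt2')
#     raw = load_dataset('text', data_files={'train': 'poems.txt'})
#     tokenized = raw.map(tokenize(tokenizer), batched=True,
#                         remove_columns=['text'])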
|


def group_texts(examples: Dict[str, List[List[int]]],
                block_size: int = 128) -> Dict[str, List[List[int]]]:
|
""" |
|
Helper function to concatenate a tokenized dataset (with the function above) |
|
in chunks of `block_size`. The code was taken from |
|
https://github.com/huggingface/notebooks/blob/main/examples/language_modeling.ipynb |
|
|
|
Parameters |
|
---------- |
|
examples : Dict[str, List[int]] |
|
This is actually a `LazyBatch` from the transformers library, that is |
|
given by the `DatasetDict.map` method. It should be the dataset returned |
|
after tokenization with the function returned by `tokenize`. It should |
|
have 2 main keys: 'input_ids' and 'attention_mask'. |
|
block_size : int |
|
The size of the block to use in the training process. If the total lenght |
|
of the group of texts is not divisible by the block size it will ignore the |
|
remaining data for simplicity. |
|
|
|
Returns |
|
------- |
|
Dict[str, List[str, int]] |
|
The dictionary that will provide the new dataset divided in chunks of |
|
`block_size`. |
|
""" |
|
    # Flatten each batched column (e.g. 'input_ids') into one long sequence.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples['input_ids'])

    # Drop the last partial chunk: keep only a multiple of `block_size`.
    total_length = (total_length // block_size) * block_size

    # Split every column into consecutive chunks of `block_size` tokens.
    result = {
        k: [t[i:i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }

    # For causal language modeling the labels are the inputs themselves;
    # the model shifts them internally when computing the loss.
    result["labels"] = result["input_ids"].copy()
    return result
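

# Usage sketch (continuing the hypothetical example above): `tokenized` is
# the dataset produced by mapping the function returned by `tokenize`. A
# non-default block size can be passed through `fn_kwargs`.
#
#     lm_datasets = tokenized.map(group_texts, batched=True,
#                                 fn_kwargs={'block_size': 128})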
|