# Build and train a byte-level BPE tokenizer on a plain-text corpus.
from tokenizers import (
    Tokenizer,
    decoders,
    models,
    pre_tokenizers,
    processors,
    trainers,
)

# Empty BPE model; its vocabulary and merges are learned by train() below.
tokenizer = Tokenizer(models.BPE())

# Use the matching ByteLevel component at every stage of the pipeline
# (pre-tokenization, post-processing, decoding) so they agree with each other.
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel()

trainer = trainers.BpeTrainer(
    special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[MASK2]"],
    # Seed the vocabulary with the full byte-level alphabet. The model has no
    # unk_token, so without this any byte-level symbol that never occurs in
    # the training corpus would be missing from the vocabulary and could not
    # be encoded later.
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)

# Train in place on the corpus file; the path is relative to the current
# working directory.
tokenizer.train(["archivo_texto.txt"], trainer)