File size: 414 Bytes
bfb1bd0
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors

# Assemble a byte-level BPE tokenizer: the same ByteLevel scheme is used for
# pre-tokenization, decoding and post-processing so the three stages agree.
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel()

# The BPE model above is created without an unk_token, so a symbol that is
# missing from the learned vocabulary cannot be represented at encode time.
# Seeding the trainer with the full ByteLevel alphabet keeps every byte symbol
# in the vocabulary even when it never occurs in the training file.
trainer = trainers.BpeTrainer(
    special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[MASK2]"],
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)

# Train on the local text corpus.
# NOTE(review): the trained tokenizer only lives in memory here; nothing in
# this script persists it (e.g. via tokenizer.save) -- confirm that is intended.
tokenizer.train(["archivo_texto.txt"], trainer)