Cristian283 committed on
Commit
bfb1bd0
1 Parent(s): b1ee364
Files changed (1) hide show
  1. J +9 -0
J ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
# Train a byte-level BPE tokenizer on a local text file.
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors

# Empty BPE model; its vocabulary and merges are learned by train() below.
tokenizer = Tokenizer(models.BPE())
# Use the ByteLevel component at every stage (pre-tokenize, decode,
# post-process) so encoding and decoding stay consistent with each other.
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel()

trainer = trainers.BpeTrainer(
    special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[MASK2]"],
    # Seed the vocabulary with the full byte-level alphabet. Without it only
    # the byte symbols that occur in the training file are learned, and since
    # the BPE model has no unk_token, unseen bytes would be dropped on encode.
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)
tokenizer.train(["archivo_texto.txt"], trainer)
# NOTE(review): the trained tokenizer is not saved anywhere in the visible
# code; confirm whether a tokenizer.save(...) call is expected after training.