Cristian283
committed on
Commit
•
bfb1bd0
1
Parent(s):
b1ee364
Create J
Browse files
J
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors
|
2 |
+
|
3 |
+
# Create the tokenizer around a BPE model constructed with no arguments;
# presumably its vocabulary and merges are learned by the train() call below.
tokenizer = Tokenizer(models.BPE())
|
4 |
+
# Split input with the byte-level pre-tokenizer, using the library defaults.
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
|
5 |
+
# Use the byte-level decoder so decoding pairs with the byte-level pre-tokenizer.
tokenizer.decoder = decoders.ByteLevel()
|
6 |
+
# Attach the byte-level post-processor, using the library defaults.
tokenizer.post_processor = processors.ByteLevel()
|
7 |
+
|
8 |
+
# Configure BPE training.
# - special_tokens: tokens the model must know about regardless of the corpus.
# - initial_alphabet: seed the vocabulary with the full byte-level alphabet so
#   that byte symbols absent from the training file can still be encoded later
#   (without this, only symbols seen during training enter the vocabulary).
trainer = trainers.BpeTrainer(
    special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[MASK2]"],
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)
|
9 |
+
# Train the tokenizer on the listed text file using the trainer configured above.
# NOTE(review): nothing in this script saves the trained tokenizer afterwards,
# so the result exists only in memory for this run — confirm whether a
# tokenizer.save(...) call is intended.
tokenizer.train(["archivo_texto.txt"], trainer)
|