#!/bin/bash
# scandeng-tokenizer / convert.sh
# Converts a SentencePiece BPE model (trained with byte_fallback, for
# autoregressive models) into a Hugging Face fast tokenizer.
# The SentencePiece model was trained with:
# ./spm_train --vocab_size 32000 --character_coverage 1.0 --hard_vocab_limit --model_type bpe \
#   --pad_id 3 --shuffle_input_sentence true --model_prefix ./sentencepiece --byte_fallback=true \
#   --input text.txt --input_sentence_size=100000 --num_threads 8
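# The same training run through the sentencepiece Python API, for reference
# (a sketch kept commented out like the CLI call above; assumes the
# `sentencepiece` package is installed and the same text.txt corpus):
#
# python <<PYEOF
# import sentencepiece as spm
#
# spm.SentencePieceTrainer.train(
#     input="text.txt",
#     input_sentence_size=100000,
#     shuffle_input_sentence=True,
#     model_type="bpe",
#     vocab_size=32000,
#     character_coverage=1.0,
#     hard_vocab_limit=True,
#     byte_fallback=True,
#     pad_id=3,
#     num_threads=8,
#     model_prefix="sentencepiece",  # writes sentencepiece.model / sentencepiece.vocab
# )
# PYEOF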
# Fetch HF's helper script and extract the BPE vocab and merge rules
# from the trained SentencePiece model
wget -O sentencepiece_extractor.py https://raw.githubusercontent.com/huggingface/tokenizers/master/bindings/python/scripts/sentencepiece_extractor.py
python sentencepiece_extractor.py --provider sentencepiece --model sentencepiece.model --merges-output-path ./merges.txt --vocab-output-path ./vocab.json
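
# Optional: a quick look at what the extractor produced (a minimal check,
# nothing below depends on it). vocab.json maps tokens to ids and merges.txt
# lists the BPE merge rules.
python <<EOF
import json

with open("vocab.json") as f:
    vocab = json.load(f)
print(len(vocab), "tokens in vocab")
with open("merges.txt") as f:
    print("first line of merges:", f.readline().strip())
EOF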
python <<EOF
from transformers import AutoTokenizer
from tokenizers import SentencePieceBPETokenizer

# Rebuild the tokenizer from the extracted vocab and merges
tokenizer = SentencePieceBPETokenizer.from_file("./vocab.json", "./merges.txt")
# Restore the SentencePiece behaviours lost in extraction: byte_fallback
# encodes out-of-vocabulary characters as <0xNN> byte tokens, and fuse_unk
# merges consecutive unknown pieces into a single <unk>
tokenizer.model.byte_fallback = True
tokenizer.model.fuse_unk = True
tokenizer.save("./tokenizer.json")

# Reload through transformers and write out the full set of tokenizer files
htok = AutoTokenizer.from_pretrained("./")
htok.padding_side = "right"
htok.save_pretrained("./")
EOF
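
# Optional sanity check (a sketch; the sample string is arbitrary): encode and
# decode a Scandinavian sentence and print the pieces, so any byte-fallback
# tokens (<0xNN>) are visible when a character falls outside the learned vocab
python <<EOF
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./")
text = "Blåbærsyltetøy med rømme"
ids = tok(text)["input_ids"]
print(tok.convert_ids_to_tokens(ids))
print(tok.decode(ids, skip_special_tokens=True))
EOF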