DRAFT: Add a fast tokenizer implementation and converter

#11
by chielo - opened

Usage

import transformers, tokenizers

transformers.__version__, tokenizers.__version__
# >>> ('4.35.0', '0.14.1')

from transformers import AutoTokenizer

fast_tokenizer = AutoTokenizer.from_pretrained(
    "./", trust_remote_code=True, use_fast=True
)
slow_tokenizer = AutoTokenizer.from_pretrained(
    "./", trust_remote_code=True, use_fast=False
)
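
# Extra sanity check (not part of the original demo; a hedged sketch): a
# faithful conversion should keep the vocabulary size identical between the
# fast and slow tokenizers.
assert len(fast_tokenizer) == len(slow_tokenizer)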

content = "是哪个星球的呢?"  # translation: "Which planet is it from?"
history = [
    {
        "role": "user",
        "content": "这是什么语言?“aburaka    dabura   ”",
        "metadata": {"username": "Chielo"},
    },
    {"role": "assistant", "content": "这是来自外星的语言。"},
]

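# Build the chat prompt with both tokenizers; build_chat_text (on the fast
# tokenizer added in this PR) additionally returns the rendered prompt as a
# plain string.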
old_inputs = slow_tokenizer.build_chat_input(content, history=history)
new_inputs = fast_tokenizer.build_chat_input(content, history=history)
new_text = fast_tokenizer.build_chat_text(content, history=history)

old_input_ids = old_inputs["input_ids"][0].tolist()

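# The fast tokenizer reproduces the slow tokenizer's input_ids exactly: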
new_inputs["input_ids"][0].tolist() == old_input_ids
# >>> True

fast_tokenizer.encode(new_text) == old_input_ids
# >>> True

fast_tokenizer.decode(old_input_ids) == slow_tokenizer.decode(old_input_ids)
# >>> True
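
# A further hypothetical check (not from the PR demo): plain-text encoding
# should also agree between the two implementations, not only the chat path.
sample = "这是什么语言?"  # translation: "What language is this?"
assert fast_tokenizer.encode(sample) == slow_tokenizer.encode(sample)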

new_text
# >>> "<|user|><!encode-sep!>{'username': 'Chielo'}\n<!encode-sep!>这是什么语言?“aburaka    dabura   ”<|assistant|><!encode-sep!>\n<!encode-sep!>这是来自外星的语言。<|user|><!encode-sep!>\n<!encode-sep!>是哪个星球的呢?<|assistant|>"
chielo changed pull request title from Add a fast tokenizer implementation and converter to DRAFT: Add a fast tokenizer implementation and converter
chielo changed pull request status to closed
chielo changed pull request status to open
chielo changed pull request status to closed
