tokenizer-arena / playground_examples.py
xu-song's picture
update
6ef6bf4
raw
history blame
4.16 kB
"""
## characters
- alphanumeric characters
- numeric characters
- special characters: A special character is a character that is not an alphabetic or numeric character.
- ASCII control characters
- punctuation marks
- accent marks
- mathematical symbols
- whitespace:
- https://en.wikipedia.org/wiki/Whitespace_character
- https://emptycharacter.com/
https://www.computerhope.com/jargon/s/specchar.htm
"""
import random
from datasets import load_dataset
# Default user input text: four lines (English, Spanish, Chinese, Japanese)
# joined by newlines, with no trailing newline.
default_user_input = "\n".join(
    (
        "Replace this text in the input field to see how tokenization works.",
        "Buenos días!",
        "华为发布Mate60手机。",
        "ラグビーワールドカップ2023フランス",
    )
)

# Default tokenizer names.
# An earlier candidate for the first default was "Meta/llama3".
default_tokenizer_name_1 = "gradientai/Llama-3-8B-Instruct-Gradient-1048k"
default_tokenizer_name_2 = "openai/gpt-4o"
def get_sample_input():
    """Load the cc100 sample datasets and return the default input per language.

    For every language code in the defaults table, the matching configuration
    of the ``eson/cc100-samples`` dataset (``train`` split) is loaded and
    printed. Nothing is drawn from the loaded datasets, so the returned
    mapping is the defaults table unchanged: only ``"en"`` carries non-empty
    text.

    Returns:
        dict[str, str]: language code -> default input text.
    """
    default_inputs = {
        "en": "Replace this text in the input field to see how tokenization works.",
        "zh-Hans": "",
        "es": "",
        "de": "",
    }
    # Seeds the global RNG for reproducibility; no random call is made in
    # this function itself.
    random.seed(10)
    # Iterating the dict yields its keys (the language codes), each of which
    # is passed as the dataset configuration name.
    for lang_code in default_inputs:
        dataset = load_dataset("eson/cc100-samples", lang_code, split="train")
        print(dataset)
        # NOTE(review): debug marker; assumed to belong to the loop body — confirm.
        print(1)
    return default_inputs
# Example rows grouped by language key. Every row is a three-item list:
# a sample text followed by two tokenizer names. The part of the text before
# the first ":" names the kind of sample.
examples = {
    "en": [
        [
            "number: (10086 + 98) = 100184",
            "huggyllama/llama-7b",
            "bigscience/bloom",
        ],
        # chatglm has blank_n tokens; bert drops the spaces
        [
            "whitespace: 2spaces 8spaces\t1tab\t\t2tab\n1newline",
            "huggyllama/llama-7b",
            "google-bert/bert-base-cased",
        ],
        # Wider punctuation set kept for reference:
        # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
        # llama's vocabulary is rather small
        [
            "punctuation: ,.:/?+=\",。!?;【】〔〕〖〗",
            "google/gemma-7b",
            "huggyllama/llama-7b",
        ],
        [
            "symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤",
            "baichuan-inc/Baichuan-7B",
            "huggyllama/llama-7b",
        ],
        # Disabled example:
        # ["special: [PAD] [UNK] [CLS] [SEP] [MASK] <|system|> <|user|> <|assistant|> <|endoftext|>", "", ""],
    ],
    "zh": [
        # chatglm has blank_n tokens
        ["空格测试: 2个空格 8个空格", "llama", "chatglm2_6b"],
        ["标点测试:,。!?;", "baichuan_7b", "llama"],
        [
            "符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤",
            "baichuan_7b",
            "llama",
        ],
        ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
        ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
    ],
}
# Tokenizer pairs worth comparing.
# NOTE: the first three rows are 4-tuples (two tokenizer names, a text and a
# comment); every later row is a 3-tuple (two tokenizer names and one string).
more_examples = [
    # BERT family
    (
        "google-bert/bert-base-cased",
        "google-bert/bert-base-uncased",
        "",
        "",
    ),  # clue VS kplug, bert VS clue
    ("bert-base-cased", "clue", "", "增加了[]()"),
    ("roberta-chinese-clue", "kplug", "", ""),
    # LLaMA family (sentencepiece-based)
    (
        "baichuan",
        "baichuan2",
        "baichuan2支持多空格 ,多个换行\n\n\n,do not add dummy prefix as Baichuan1",
    ),
    (
        "llama",
        "baichuan2",
        "baichuan2支持多空格 ,多个换行\n\n",
    ),
    ("llama", "chinese-llama-2-7b", ""),
    ("llama", "llama3", "扩充词典"),
    ("chinese-llama-lora-7b", "chinese-llama-2-7b", ""),
    # GLM family (sentencepiece-based)
    ("glm", "chatglm1", ""),
    ("chatglm1", "chatglm2", ""),
    # GPT-2 family
    ("gpt2", "moss", ""),
    ("", "", ""),
    # OpenAI family (tiktoken)
    ("qwen", "gpt_35_turbo", ""),
]
# Key of the example group in use.
lang = "en"
# One label per example of that group: the part of the sample text that
# precedes the first ":" (the whole text when it contains no ":").
example_types = [row[0].partition(":")[0] for row in examples[lang]]
def example_fn(example_idx):
    """Return the example stored at ``example_idx`` for the current ``lang``.

    Args:
        example_idx: index into ``examples[lang]``.

    Returns:
        The entry found at that index of ``examples[lang]``.
    """
    current_examples = examples[lang]
    return current_examples[example_idx]
def get_more_example(example_rows=None):
    """Print one tokenizer-arena comparison URL per example row.

    A row is ``(tokenizer1, tokenizer2, text)``, optionally followed by a
    comment. The comment never appears in the URL, so rows with and without
    it are both accepted; previously a row without a comment raised
    ``ValueError`` during unpacking.

    Args:
        example_rows: iterable of rows to print URLs for. When omitted, the
            module-level ``more_examples`` list is used.

    Returns:
        None. The URLs are written to standard output, one per line.
    """
    import urllib.parse

    if example_rows is None:
        example_rows = more_examples
    url_prefix = "https://huggingface.co/spaces/eson/tokenizer-arena"
    # The starred target absorbs the optional trailing comment.
    for tokenizer1, tokenizer2, text, *_comment in example_rows:
        params = (
            ("tokenizer1", tokenizer1),
            ("tokenizer2", tokenizer2),
            ("text", text),
        )
        # Percent-encode every value, not just the text, so that characters
        # such as "&" or "=" in a value cannot break the query string.
        # quote() leaves "/" as is, so names like "org/model" are unchanged.
        query = "&".join(
            f"{key}={urllib.parse.quote(value)}" for key, value in params
        )
        print(f"{url_prefix}?{query}")
if __name__ == "__main__":
    # Run as a script: print the comparison URLs.
    get_more_example()