{
"FacebookAI/xlm-roberta-base": {
"tokenizer": "xlm-roberta-base",
"organization": "Facebook",
"vocab_size": 250002,
"num(digit)": 2728,
"len(digit)": "1,3,9",
"num(space)": 1,
"len(space)": "1,1,1",
"num(ar)": 14644,
"len(ar)": "1,4,16",
"num(zh)": 18457,
"len(zh)": "1,2,16",
"num(ja)": 20572,
"len(ja)": "1,2,16",
"num(ja-kana)": 3434,
"len(ja-kana)": "1,3,12",
"num(ko)": 5373,
"len(ko)": "1,2,8"
},
"clue/roberta_chinese_clue_tiny": {
"tokenizer": "roberta-chinese-clue",
"organization": "CLUE",
"vocab_size": 8021,
"num(digit)": 230,
"len(digit)": "1,4,10",
"num(space)": 0,
"len(space)": "-",
"num(ar)": 30,
"len(ar)": "1,2,3",
"num(zh)": 5689,
"len(zh)": "1,1,1",
"num(ja)": 5691,
"len(ja)": "1,1,3",
"num(ja-kana)": 0,
"len(ja-kana)": "-",
"num(ko)": 0,
"len(ko)": "-"
},
"dbmdz/bert-base-german-uncased": {
"tokenizer": "bert-base-german-uncased",
"organization": "dbmdz",
"vocab_size": 31102,
"num(digit)": 1733,
"len(digit)": "1,4,12",
"num(space)": 0,
"len(space)": "-",
"num(ar)": 0,
"len(ar)": "-",
"num(zh)": 0,
"len(zh)": "-",
"num(ja)": 0,
"len(ja)": "-",
"num(ja-kana)": 0,
"len(ja-kana)": "-",
"num(ko)": 0,
"len(ko)": "-"
},
"google-bert/bert-base-cased": {
"tokenizer": "bert-base-cased",
"organization": "Google",
"vocab_size": 28996,
"num(digit)": 926,
"len(digit)": "1,4,11",
"num(space)": 0,
"len(space)": "-",
"num(ar)": 94,
"len(ar)": "1,3,4",
"num(zh)": 226,
"len(zh)": "1,2,3",
"num(ja)": 390,
"len(ja)": "1,2,3",
"num(ja-kana)": 164,
"len(ja-kana)": "1,2,3",
"num(ko)": 10,
"len(ko)": "1,2,3"
},
"google-bert/bert-base-chinese": {
"tokenizer": "bert-base-chinese",
"organization": "Google",
"vocab_size": 21128,
"num(digit)": 1451,
"len(digit)": "1,3,12",
"num(space)": 2,
"len(space)": "1,2,3",
"num(ar)": 30,
"len(ar)": "1,2,3",
"num(zh)": 14642,
"len(zh)": "1,2,3",
"num(ja)": 15197,
"len(ja)": "1,3,15",
"num(ja-kana)": 553,
"len(ja-kana)": "1,3,15",
"num(ko)": 0,
"len(ko)": "-"
},
"google-bert/bert-base-german-cased": {
"tokenizer": "bert-base-german-cased",
"organization": "Google",
"vocab_size": 30000,
"num(digit)": 4065,
"len(digit)": "1,11,22",
"num(space)": 0,
"len(space)": "-",
"num(ar)": 0,
"len(ar)": "-",
"num(zh)": 0,
"len(zh)": "-",
"num(ja)": 0,
"len(ja)": "-",
"num(ja-kana)": 0,
"len(ja-kana)": "-",
"num(ko)": 0,
"len(ko)": "-"
},
"google-bert/bert-base-multilingual-cased": {
"tokenizer": "bert-base-multilingual-cased",
"organization": "Google",
"vocab_size": 119547,
"num(digit)": 2583,
"len(digit)": "1,3,13",
"num(space)": 0,
"len(space)": "-",
"num(ar)": 4873,
"len(ar)": "1,5,14",
"num(zh)": 13542,
"len(zh)": "1,2,3",
"num(ja)": 14880,
"len(ja)": "1,3,10",
"num(ja-kana)": 1336,
"len(ja-kana)": "1,4,10",
"num(ko)": 3271,
"len(ko)": "1,3,6"
},
"google-bert/bert-base-multilingual-uncased": {
"tokenizer": "bert-base-multilingual-uncased",
"organization": "Google",
"vocab_size": 105879,
"num(digit)": 2510,
"len(digit)": "1,3,13",
"num(space)": 2,
"len(space)": "1,2,3",
"num(ar)": 4530,
"len(ar)": "1,5,13",
"num(zh)": 16658,
"len(zh)": "1,2,3",
"num(ja)": 17858,
"len(ja)": "1,3,10",
"num(ja-kana)": 1188,
"len(ja-kana)": "1,4,10",
"num(ko)": 0,
"len(ko)": "-"
},
"google-bert/bert-base-uncased": {
"tokenizer": "bert-base-uncased",
"organization": "Google",
"vocab_size": 30522,
"num(digit)": 2056,
"len(digit)": "1,4,11",
"num(space)": 0,
"len(space)": "-",
"num(ar)": 88,
"len(ar)": "1,3,5",
"num(zh)": 488,
"len(zh)": "1,2,3",
"num(ja)": 676,
"len(ja)": "1,2,3",
"num(ja-kana)": 188,
"len(ja-kana)": "1,2,3",
"num(ko)": 0,
"len(ko)": "-"
},
"google/mobilebert-uncased": {
"tokenizer": "mobilebert-uncased",
"organization": "Google",
"vocab_size": 30522,
"num(digit)": 2056,
"len(digit)": "1,4,11",
"num(space)": 0,
"len(space)": "-",
"num(ar)": 88,
"len(ar)": "1,3,5",
"num(zh)": 488,
"len(zh)": "1,2,3",
"num(ja)": 676,
"len(ja)": "1,2,3",
"num(ja-kana)": 188,
"len(ja-kana)": "1,2,3",
"num(ko)": 0,
"len(ko)": "-"
},
"tohoku-nlp/bert-base-japanese": {
"tokenizer": "bert-base-japanese",
"organization": "Tohoku",
"vocab_size": 32000,
"num(digit)": 669,
"len(digit)": "1,3,5",
"num(space)": 0,
"len(space)": "-",
"num(ar)": 10,
"len(ar)": "1,3,3",
"num(zh)": 18792,
"len(zh)": "1,2,11",
"num(ja)": 28367,
"len(ja)": "1,2,13",
"num(ja-kana)": 12359,
"len(ja-kana)": "1,4,13",
"num(ko)": 0,
"len(ko)": "-"
},
"gpt-4": {
"tokenizer": "gpt-4",
"organization": "OpenAI",
"vocab_size": 100277,
"num(digit)": 1110,
"len(digit)": "1,3,3",
"num(space)": 47472,
"len(space)": "1,7,128",
"num(ar)": 113,
"len(ar)": "1,2,10",
"num(zh)": 868,
"len(zh)": "1,1,7",
"num(ja)": 1035,
"len(ja)": "1,1,7",
"num(ja-kana)": 169,
"len(ja-kana)": "1,1,7",
"num(ko)": 299,
"len(ko)": "1,2,4"
},
"llama3": {
"tokenizer": "llama3",
"organization": "Meta",
"vocab_size": 128256,
"num(digit)": 1110,
"len(digit)": "1,3,3",
"num(space)": 60860,
"len(space)": "1,6,128",
"num(ar)": 3810,
"len(ar)": "1,4,11",
"num(zh)": 4424,
"len(zh)": "1,1,7",
"num(ja)": 5387,
"len(ja)": "1,2,8",
"num(ja-kana)": 1086,
"len(ja-kana)": "1,2,8",
"num(ko)": 2281,
"len(ko)": "1,2,6"
},
"google-t5/t5-large": {
"tokenizer": "t5",
"organization": "Google",
"vocab_size": 32100,
"num(digit)": 1133,
"len(digit)": "1,3,13",
"num(space)": 0,
"len(space)": "-",
"num(ar)": 0,
"len(ar)": "-",
"num(zh)": 0,
"len(zh)": "-",
"num(ja)": 0,
"len(ja)": "-",
"num(ja-kana)": 0,
"len(ja-kana)": "-",
"num(ko)": 0,
"len(ko)": "-"
},
"google/byt5-small": {
"tokenizer": "byt5-small",
"organization": "Google",
"vocab_size": 384,
"num(digit)": 10,
"len(digit)": "1,1,1",
"num(space)": 10,
"len(space)": "1,1,1",
"num(ar)": 0,
"len(ar)": "-",
"num(zh)": 0,
"len(zh)": "-",
"num(ja)": 0,
"len(ja)": "-",
"num(ja-kana)": 0,
"len(ja-kana)": "-",
"num(ko)": 0,
"len(ko)": "-"
},
"google/mt5-large": {
"tokenizer": "mt5-large",
"organization": "Google",
"vocab_size": 250100,
"num(digit)": 16829,
"len(digit)": "1,4,16",
"num(space)": 1,
"len(space)": "1,1,1",
"num(ar)": 7459,
"len(ar)": "1,3,16",
"num(zh)": 21489,
"len(zh)": "1,2,16",
"num(ja)": 27078,
"len(ja)": "1,2,16",
"num(ja-kana)": 9160,
"len(ja-kana)": "1,3,14",
"num(ko)": 4041,
"len(ko)": "1,1,10"
},
"lmsys/fastchat-t5-3b-v1.0": {
"tokenizer": "fastchat-t5-3b-v1.0",
"organization": "LMSYS",
"vocab_size": 32110,
"num(digit)": 1033,
"len(digit)": "1,3,8",
"num(space)": 0,
"len(space)": "-",
"num(ar)": 0,
"len(ar)": "-",
"num(zh)": 0,
"len(zh)": "-",
"num(ja)": 0,
"len(ja)": "-",
"num(ja-kana)": 0,
"len(ja-kana)": "-",
"num(ko)": 0,
"len(ko)": "-"
},
"paust/pko-t5-large": {
"tokenizer": "pko-t5-large",
"organization": "PAUST",
"vocab_size": 50358,
"num(digit)": 51,
"len(digit)": "1,2,3",
"num(space)": 10,
"len(space)": "1,1,1",
"num(ar)": 0,
"len(ar)": "-",
"num(zh)": 0,
"len(zh)": "-",
"num(ja)": 0,
"len(ja)": "-",
"num(ja-kana)": 0,
"len(ja-kana)": "-",
"num(ko)": 49050,
"len(ko)": "1,2,16"
},
"bloom": {
"tokenizer": "bloom",
"organization": "BigScience",
"vocab_size": 250680,
"num(digit)": 6629,
"len(digit)": "1,4,50",
"num(space)": 140180,
"len(space)": "1,6,600",
"num(ar)": 20854,
"len(ar)": "1,5,16",
"num(zh)": 30603,
"len(zh)": "1,2,23",
"num(ja)": 30816,
"len(ja)": "1,2,23",
"num(ja-kana)": 214,
"len(ja-kana)": "1,1,3",
"num(ko)": 338,
"len(ko)": "1,1,3"
},
"llama": {
"tokenizer": "llama",
"organization": "Meta",
"vocab_size": 32000,
"num(digit)": 20,
"len(digit)": "1,1,1",
"num(space)": 61,
"len(space)": "1,2,15",
"num(ar)": 55,
"len(ar)": "1,1,2",
"num(zh)": 700,
"len(zh)": "1,1,1",
"num(ja)": 837,
"len(ja)": "1,1,1",
"num(ja-kana)": 137,
"len(ja-kana)": "1,1,1",
"num(ko)": 111,
"len(ko)": "1,1,1"
},
"ClueAI/ChatYuan-large-v2": {
"tokenizer": "ChatYuan-large-v2",
"organization": "CLUE",
"vocab_size": 32128,
"num(digit)": 740,
"len(digit)": "1,3,9",
"num(space)": 0,
"len(space)": "-",
"num(ar)": 2,
"len(ar)": "1,1,1",
"num(zh)": 29591,
"len(zh)": "1,2,16",
"num(ja)": 29736,
"len(ja)": "1,2,16",
"num(ja-kana)": 145,
"len(ja-kana)": "1,1,2",
"num(ko)": 0,
"len(ko)": "-"
},
"Meta/llama3": {
"tokenizer": "llama3",
"organization": "Meta",
"vocab_size": 128256,
"num(digit)": 1110,
"len(digit)": "1,3,3",
"num(space)": 60860,
"len(space)": "1,6,128",
"num(ar)": 3810,
"len(ar)": "1,4,11",
"num(zh)": 4424,
"len(zh)": "1,1,7",
"num(ja)": 5387,
"len(ja)": "1,2,8",
"num(ja-kana)": 1086,
"len(ja-kana)": "1,2,8",
"num(ko)": 2281,
"len(ko)": "1,2,6"
},
"openai/gpt-4": {
"tokenizer": "gpt-4",
"organization": "OpenAI",
"vocab_size": 100277,
"num(digit)": 1110,
"len(digit)": "1,3,3",
"num(space)": 47472,
"len(space)": "1,7,128",
"num(ar)": 113,
"len(ar)": "1,2,10",
"num(zh)": 868,
"len(zh)": "1,1,7",
"num(ja)": 1035,
"len(ja)": "1,1,7",
"num(ja-kana)": 169,
"len(ja-kana)": "1,1,7",
"num(ko)": 299,
"len(ko)": "1,2,4"
},
"gradientai/Llama-3-8B-Instruct-Gradient-1048k": {
"tokenizer": "llama3",
"organization": "Meta",
"vocab_size": 128256,
"num(digit)": 1110,
"len(digit)": "1,3,3",
"num(space)": 60860,
"len(space)": "1,6,128",
"num(ar)": 3810,
"len(ar)": "1,4,11",
"num(zh)": 4424,
"len(zh)": "1,1,7",
"num(ja)": 5387,
"len(ja)": "1,2,8",
"num(ja-kana)": 1086,
"len(ja-kana)": "1,2,8",
"num(ko)": 2281,
"len(ko)": "1,2,6"
},
"bigscience/bloom": {
"tokenizer": "bloom",
"organization": "BigScience",
"vocab_size": 250680,
"num(digit)": 6629,
"len(digit)": "1,4,50",
"num(space)": 140180,
"len(space)": "1,6,600",
"num(ar)": 20854,
"len(ar)": "1,5,16",
"num(zh)": 30603,
"len(zh)": "1,2,23",
"num(ja)": 30816,
"len(ja)": "1,2,23",
"num(ja-kana)": 214,
"len(ja-kana)": "1,1,3",
"num(ko)": 338,
"len(ko)": "1,1,3"
},
"huggyllama/llama-7b": {
"tokenizer": "llama",
"organization": "Meta",
"vocab_size": 32000,
"num(digit)": 20,
"len(digit)": "1,1,1",
"num(space)": 61,
"len(space)": "1,2,15",
"num(ar)": 55,
"len(ar)": "1,1,2",
"num(zh)": 700,
"len(zh)": "1,1,1",
"num(ja)": 837,
"len(ja)": "1,1,1",
"num(ja-kana)": 137,
"len(ja-kana)": "1,1,1",
"num(ko)": 111,
"len(ko)": "1,1,1"
},
"baichuan-inc/Baichuan-7B": {
"tokenizer": "baichuan",
"organization": "Baichuan",
"vocab_size": 64000,
"num(digit)": 335,
"len(digit)": "1,14,14",
"num(space)": 13,
"len(space)": "1,1,1",
"num(ar)": 299,
"len(ar)": "1,1,2",
"num(zh)": 27676,
"len(zh)": "1,1,9",
"num(ja)": 28522,
"len(ja)": "1,1,9",
"num(ja-kana)": 178,
"len(ja-kana)": "1,1,1",
"num(ko)": 1591,
"len(ko)": "1,1,1"
},
"01-ai/Yi-34B": {
"tokenizer": "Yi-34B",
"organization": "Yi",
"vocab_size": 64000,
"num(digit)": 200,
"len(digit)": "1,13,15",
"num(space)": 24274,
"len(space)": "1,7,16",
"num(ar)": 18,
"len(ar)": "1,1,4",
"num(zh)": 21356,
"len(zh)": "1,2,12",
"num(ja)": 21407,
"len(ja)": "1,2,12",
"num(ja-kana)": 51,
"len(ja-kana)": "1,1,2",
"num(ko)": 28,
"len(ko)": "1,1,2"
},
"01-ai/Yi-6B": {
"tokenizer": "Yi-6B",
"organization": "Yi",
"vocab_size": 64000,
"num(digit)": 200,
"len(digit)": "1,13,15",
"num(space)": 24274,
"len(space)": "1,7,16",
"num(ar)": 18,
"len(ar)": "1,1,4",
"num(zh)": 21356,
"len(zh)": "1,2,12",
"num(ja)": 21407,
"len(ja)": "1,2,12",
"num(ja-kana)": 51,
"len(ja-kana)": "1,1,2",
"num(ko)": 28,
"len(ko)": "1,1,2"
},
"01-ai/Yi-VL-34B": {
"tokenizer": "Yi-VL-34B",
"organization": "Yi",
"vocab_size": 64000,
"num(digit)": 200,
"len(digit)": "1,13,15",
"num(space)": 43,
"len(space)": "1,2,15",
"num(ar)": 18,
"len(ar)": "1,1,4",
"num(zh)": 21356,
"len(zh)": "1,2,12",
"num(ja)": 21407,
"len(ja)": "1,2,12",
"num(ja-kana)": 51,
"len(ja-kana)": "1,1,2",
"num(ko)": 28,
"len(ko)": "1,1,2"
},
"ClassCat/gpt2-base-french": {
"tokenizer": "gpt2-base-french",
"organization": "ClassCat",
"vocab_size": 50000,
"num(digit)": 1833,
"len(digit)": "1,4,5",
"num(space)": 31889,
"len(space)": "1,7,32",
"num(ar)": 41,
"len(ar)": "1,1,4",
"num(zh)": 27,
"len(zh)": "1,1,1",
"num(ja)": 46,
"len(ja)": "1,1,2",
"num(ja-kana)": 19,
"len(ja-kana)": "1,1,2",
"num(ko)": 0,
"len(ko)": "-"
},
"ClassCat/gpt2-base-spanish": {
"tokenizer": "gpt2-base-spanish",
"organization": "ClassCat",
"vocab_size": 50000,
"num(digit)": 1492,
"len(digit)": "1,4,9",
"num(space)": 34496,
"len(space)": "1,8,32",
"num(ar)": 36,
"len(ar)": "1,1,4",
"num(zh)": 13,
"len(zh)": "1,1,1",
"num(ja)": 36,
"len(ja)": "1,1,2",
"num(ja-kana)": 23,
"len(ja-kana)": "1,1,2",
"num(ko)": 0,
"len(ko)": "-"
},
"ClueAI/PromptCLUE-base": {
"tokenizer": "PromptCLUE-base",
"organization": "CLUE",
"vocab_size": 32128,
"num(digit)": 740,
"len(digit)": "1,3,9",
"num(space)": 0,
"len(space)": "-",
"num(ar)": 2,
"len(ar)": "1,1,1",
"num(zh)": 29591,
"len(zh)": "1,2,16",
"num(ja)": 29736,
"len(ja)": "1,2,16",
"num(ja-kana)": 145,
"len(ja-kana)": "1,1,2",
"num(ko)": 0,
"len(ko)": "-"
},
"CohereForAI/aya-101": {
"tokenizer": "aya-101",
"organization": "Cohere For AI",
"vocab_size": 250100,
"num(digit)": 16829,
"len(digit)": "1,4,16",
"num(space)": 1,
"len(space)": "1,1,1",
"num(ar)": 7459,
"len(ar)": "1,3,16",
"num(zh)": 21489,
"len(zh)": "1,2,16",
"num(ja)": 27078,
"len(ja)": "1,2,16",
"num(ja-kana)": 9160,
"len(ja-kana)": "1,3,14",
"num(ko)": 4041,
"len(ko)": "1,1,10"
},
"EleutherAI/gpt-neox-20b": {
"tokenizer": "gpt-neox-20b",
"organization": "EleutherAI",
"vocab_size": 50277,
"num(digit)": 2036,
"len(digit)": "1,3,35",
"num(space)": 28996,
"len(space)": "1,7,512",
"num(ar)": 94,
"len(ar)": "1,2,4",
"num(zh)": 313,
"len(zh)": "1,1,2",
"num(ja)": 480,
"len(ja)": "1,1,4",
"num(ja-kana)": 167,
"len(ja-kana)": "1,1,4",
"num(ko)": 25,
"len(ko)": "1,1,2"
},
"HuggingFaceH4/starchat-alpha": {
"tokenizer": "starchat-alpha",
"organization": "-",
"vocab_size": 49156,
"num(digit)": 10,
"len(digit)": "1,1,1",
"num(space)": 16515,
"len(space)": "1,6,256",
"num(ar)": 84,
"len(ar)": "1,2,4",
"num(zh)": 2030,
"len(zh)": "1,1,7",
"num(ja)": 2368,
"len(ja)": "1,1,8",
"num(ja-kana)": 360,
"len(ja-kana)": "1,2,8",
"num(ko)": 491,
"len(ko)": "1,2,5"
},
"HuggingFaceH4/zephyr-7b-beta": {
"tokenizer": "zephyr-7b-beta",
"organization": "HuggingFace",
"vocab_size": 32000,
"num(digit)": 20,
"len(digit)": "1,1,1",
"num(space)": 85,
"len(space)": "1,3,15",
"num(ar)": 71,
"len(ar)": "1,1,2",
"num(zh)": 1459,
"len(zh)": "1,1,2",
"num(ja)": 1593,
"len(ja)": "1,1,2",
"num(ja-kana)": 134,
"len(ja-kana)": "1,1,1",
"num(ko)": 346,
"len(ko)": "1,1,1"
},
"LLM360/CrystalCoder": {
"tokenizer": "CrystalCoder",
"organization": "MBZUAI",
"vocab_size": 32022,
"num(digit)": 20,
"len(digit)": "1,1,1",
"num(space)": 61,
"len(space)": "1,2,15",
"num(ar)": 55,
"len(ar)": "1,1,2",
"num(zh)": 700,
"len(zh)": "1,1,1",
"num(ja)": 837,
"len(ja)": "1,1,1",
"num(ja-kana)": 137,
"len(ja-kana)": "1,1,1",
"num(ko)": 111,
"len(ko)": "1,1,1"
},
"NousResearch/Llama-2-7b-chat-hf": {
"tokenizer": "llama2",
"organization": "Meta",
"vocab_size": 32001,
"num(digit)": 20,
"len(digit)": "1,1,1",
"num(space)": 61,
"len(space)": "1,2,15",
"num(ar)": 55,
"len(ar)": "1,1,2",
"num(zh)": 700,
"len(zh)": "1,1,1",
"num(ja)": 837,
"len(ja)": "1,1,1",
"num(ja-kana)": 137,
"len(ja-kana)": "1,1,1",
"num(ko)": 111,
"len(ko)": "1,1,1"
},
"OrionStarAI/Orion-14B-Chat": {
"tokenizer": "Orion-14B-Chat",
"organization": "OrionStar",
"vocab_size": 84608,
"num(digit)": 1559,
"len(digit)": "1,4,14",
"num(space)": 18383,
"len(space)": "1,6,16",
"num(ar)": 102,
"len(ar)": "1,1,1",
"num(zh)": 46998,
"len(zh)": "1,2,16",
"num(ja)": 49644,
"len(ja)": "1,2,16",
"num(ja-kana)": 2987,
"len(ja-kana)": "1,3,11",
"num(ko)": 5110,
"len(ko)": "1,2,7"
},
"Qwen/Qwen-7B-Chat": {
"tokenizer": "Qwen",
"organization": "Alibaba",
"vocab_size": 151851,
"num(digit)": 10,
"len(digit)": "1,1,1",
"num(space)": 55883,
"len(space)": "1,6,128",
"num(ar)": 4018,
"len(ar)": "1,3,12",
"num(zh)": 25557,
"len(zh)": "1,2,7",
"num(ja)": 27206,
"len(ja)": "1,2,11",
"num(ja-kana)": 2089,
"len(ja-kana)": "1,3,11",
"num(ko)": 3495,
"len(ko)": "1,1,5"
},
"Qwen/Qwen1.5-14B-Chat": {
"tokenizer": "Qwen1.5",
"organization": "Alibaba",
"vocab_size": 151646,
"num(digit)": 10,
"len(digit)": "1,1,1",
"num(space)": 55883,
"len(space)": "1,6,128",
"num(ar)": 4018,
"len(ar)": "1,3,12",
"num(zh)": 25557,
"len(zh)": "1,2,7",
"num(ja)": 27206,
"len(ja)": "1,2,11",
"num(ja-kana)": 2089,
"len(ja-kana)": "1,3,11",
"num(ko)": 3495,
"len(ko)": "1,1,5"
},
"Skywork/Skywork-13B-Math": {
"tokenizer": "Skywork-13B-Math",
"organization": "Kunlun",
"vocab_size": 65519,
"num(digit)": 20,
"len(digit)": "1,1,1",
"num(space)": 62,
"len(space)": "1,2,15",
"num(ar)": 56,
"len(ar)": "1,1,2",
"num(zh)": 33913,
"len(zh)": "1,2,5",
"num(ja)": 34064,
"len(ja)": "1,2,5",
"num(ja-kana)": 150,
"len(ja-kana)": "1,1,1",
"num(ko)": 111,
"len(ko)": "1,1,1"
},
"Skywork/Skywork-13B-base": {
"tokenizer": "Skywork-13B-base",
"organization": "Kunlun",
"vocab_size": 65519,
"num(digit)": 20,
"len(digit)": "1,1,1",
"num(space)": 62,
"len(space)": "1,2,15",
"num(ar)": 56,
"len(ar)": "1,1,2",
"num(zh)": 33913,
"len(zh)": "1,2,5",
"num(ja)": 34064,
"len(ja)": "1,2,5",
"num(ja-kana)": 150,
"len(ja-kana)": "1,1,1",
"num(ko)": 111,
"len(ko)": "1,1,1"
},
"THUDM/chatglm-6b": {
"tokenizer": "chatglm-6b",
"organization": "Tsinghua",
"vocab_size": 130344,
"num(digit)": 20,
"len(digit)": "1,1,1",
"num(space)": 93,
"len(space)": "1,34,80",
"num(ar)": 137,
"len(ar)": "1,2,4",
"num(zh)": 61358,
"len(zh)": "1,2,16",
"num(ja)": 61784,
"len(ja)": "1,2,16",
"num(ja-kana)": 439,
"len(ja-kana)": "1,2,5",
"num(ko)": 114,
"len(ko)": "1,1,3"
},
"THUDM/chatglm2-6b": {
"tokenizer": "chatglm2-6b",
"organization": "Tsinghua",
"vocab_size": 64787,
"num(digit)": 20,
"len(digit)": "1,1,1",
"num(space)": 67,
"len(space)": "1,2,15",
"num(ar)": 57,
"len(ar)": "1,1,2",
"num(zh)": 30922,
"len(zh)": "1,2,16",
"num(ja)": 31065,
"len(ja)": "1,2,16",
"num(ja-kana)": 143,
"len(ja-kana)": "1,1,1",
"num(ko)": 604,
"len(ko)": "1,1,1"
},
"THUDM/chatglm3-6b": {
"tokenizer": "chatglm3-6b",
"organization": "Tsinghua",
"vocab_size": 64796,
"num(digit)": 20,
"len(digit)": "1,1,1",
"num(space)": 67,
"len(space)": "1,2,15",
"num(ar)": 57,
"len(ar)": "1,1,2",
"num(zh)": 30922,
"len(zh)": "1,2,16",
"num(ja)": 31065,
"len(ja)": "1,2,16",
"num(ja-kana)": 143,
"len(ja-kana)": "1,1,1",
"num(ko)": 604,
"len(ko)": "1,1,1"
},
"TigerResearch/tigerbot-13b-chat-v2": {
"tokenizer": "tigerbot-13b-chat-v2",
"organization": "Tigerobo",
"vocab_size": 60515,
"num(digit)": 20,
"len(digit)": "1,1,1",
"num(space)": 61,
"len(space)": "1,2,15",
"num(ar)": 55,
"len(ar)": "1,1,2",
"num(zh)": 28603,
"len(zh)": "1,2,16",
"num(ja)": 28770,
"len(ja)": "1,2,16",
"num(ja-kana)": 167,
"len(ja-kana)": "1,1,2",
"num(ko)": 261,
"len(ko)": "1,1,1"
},
"TigerResearch/tigerbot-70b-chat-v4-4k": {
"tokenizer": "tigerbot-70b-chat-v4-4k",
"organization": "Tigerobo",
"vocab_size": 65110,
"num(digit)": 20,
"len(digit)": "1,1,1",
"num(space)": 61,
"len(space)": "1,2,15",
"num(ar)": 55,
"len(ar)": "1,1,2",
"num(zh)": 30509,
"len(zh)": "1,2,16",
"num(ja)": 32061,
"len(ja)": "1,2,16",
"num(ja-kana)": 2071,
"len(ja-kana)": "1,2,8",
"num(ko)": 1504,
"len(ko)": "1,1,5"
},
"Upstage/SOLAR-10.7B-v1.0": {
"tokenizer": "SOLAR-10.7B-v1.0",
"organization": "-",
"vocab_size": 32000,
"num(digit)": 20,
"len(digit)": "1,1,1",
"num(space)": 85,
"len(space)": "1,3,15",
"num(ar)": 71,
"len(ar)": "1,1,2",
"num(zh)": 1459,
"len(zh)": "1,1,2",
"num(ja)": 1593,
"len(ja)": "1,1,2",
"num(ja-kana)": 134,
"len(ja-kana)": "1,1,1",
"num(ko)": 346,
"len(ko)": "1,1,1"
},
"WizardLM/WizardCoder-15B-V1.0": {
"tokenizer": "WizardCoder-15B-V1.0",
"organization": "Microsoft",
"vocab_size": 49153,
"num(digit)": 10,
"len(digit)": "1,1,1",
"num(space)": 16515,
"len(space)": "1,6,256",
"num(ar)": 84,
"len(ar)": "1,2,4",
"num(zh)": 2030,
"len(zh)": "1,1,7",
"num(ja)": 2368,
"len(ja)": "1,1,8",
"num(ja-kana)": 360,
"len(ja-kana)": "1,2,8",
"num(ko)": 491,
"len(ko)": "1,2,5"
},
"WizardLM/WizardCoder-Python-7B-V1.0": {
"tokenizer": "WizardCoder-Python-7B-V1.0",
"organization": "Microsoft",
"vocab_size": 32001,
"num(digit)": 20,
"len(digit)": "1,1,1",
"num(space)": 61,
"len(space)": "1,2,15",
"num(ar)": 55,
"len(ar)": "1,1,2",
"num(zh)": 700,
"len(zh)": "1,1,1",
"num(ja)": 837,
"len(ja)": "1,1,1",
"num(ja-kana)": 137,
"len(ja-kana)": "1,1,1",
"num(ko)": 111,
"len(ko)": "1,1,1"
},
"WizardLM/WizardLM-7B-V1.0": {
"tokenizer": "WizardLM-7B-V1.0",
"organization": "Microsoft",
"vocab_size": 32001,
"num(digit)": 20,
"len(digit)": "1,1,1",
"num(space)": 61,
"len(space)": "1,2,15",
"num(ar)": 55,
"len(ar)": "1,1,2",
"num(zh)": 700,
"len(zh)": "1,1,1",
"num(ja)": 837,
"len(ja)": "1,1,1",
"num(ja-kana)": 137,
"len(ja-kana)": "1,1,1",
"num(ko)": 111,
"len(ko)": "1,1,1"
},
"WizardLM/WizardMath-70B-V1.0": {
"tokenizer": "WizardMath-70B-V1.0",
"organization": "Microsoft",
"vocab_size": 32002,
"num(digit)": 20,
"len(digit)": "1,1,1",
"num(space)": 61,
"len(space)": "1,2,15",
"num(ar)": 55,
"len(ar)": "1,1,2",
"num(zh)": 700,
"len(zh)": "1,1,1",
"num(ja)": 837,
"len(ja)": "1,1,1",
"num(ja-kana)": 137,
"len(ja-kana)": "1,1,1",
"num(ko)": 111,
"len(ko)": "1,1,1"
},
"abeja/gpt-neox-japanese-2.7b": {
"tokenizer": "gpt-neox-japanese-2.7b",
"organization": "ABEJA",
"vocab_size": 32000,
"num(digit)": 20,
"len(digit)": "1,1,1",
"num(space)": 0,
"len(space)": "-",
"num(ar)": 0,
"len(ar)": "-",
"num(zh)": 15176,
"len(zh)": "1,2,2",
"num(ja)": 31482,
"len(ja)": "1,2,3",
"num(ja-kana)": 16306,
"len(ja-kana)": "1,3,3",
"num(ko)": 0,
"len(ko)": "-"
},
"ai21labs/Jamba-v0.1": {
"tokenizer": "Jamba-v0.1",
"organization": "AI21",
"vocab_size": 65536,
"num(digit)": 1556,
"len(digit)": "1,16,17",
"num(space)": 39501,
"len(space)": "1,7,32",
"num(ar)": 867,
"len(ar)": "1,3,8",
"num(zh)": 1157,
"len(zh)": "1,1,2",
"num(ja)": 1287,
"len(ja)": "1,1,2",
"num(ja-kana)": 130,
"len(ja-kana)": "1,1,2",
"num(ko)": 312,
"len(ko)": "1,1,2"
},
"allenai/OLMo-7B": {
"tokenizer": "OLMo-7B",
"organization": "Allen AI",
"vocab_size": 50280,
"num(digit)": 2036,
"len(digit)": "1,3,35",
"num(space)": 29019,
"len(space)": "1,7,512",
"num(ar)": 94,
"len(ar)": "1,2,4",
"num(zh)": 313,
"len(zh)": "1,1,2",
"num(ja)": 480,
"len(ja)": "1,1,4",
"num(ja-kana)": 167,
"len(ja-kana)": "1,1,4",
"num(ko)": 25,
"len(ko)": "1,1,2"
},
"baichuan-inc/Baichuan2-7B-Chat": {
"tokenizer": "baichuan2",
"organization": "Baichuan",
"vocab_size": 125696,
"num(digit)": 1023,
"len(digit)": "1,14,14",
"num(space)": 26013,
"len(space)": "1,7,32",
"num(ar)": 335,
"len(ar)": "1,1,27",
"num(zh)": 70398,
"len(zh)": "1,2,32",
"num(ja)": 71269,
"len(ja)": "1,2,32",
"num(ja-kana)": 206,
"len(ja-kana)": "1,1,9",
"num(ko)": 1595,
"len(ko)": "1,1,2"
},
"ckiplab/gpt2-base-chinese": {
"tokenizer": "gpt2-base-chinese",
"organization": "SINICA",
"vocab_size": 21128,
"num(digit)": 1451,
"len(digit)": "1,3,12",
"num(space)": 2,
"len(space)": "1,2,3",
"num(ar)": 30,
"len(ar)": "1,2,3",
"num(zh)": 14642,
"len(zh)": "1,2,3",
"num(ja)": 15197,
"len(ja)": "1,3,15",
"num(ja-kana)": 553,
"len(ja-kana)": "1,3,15",
"num(ko)": 0,
"len(ko)": "-"
},
"cyberagent/open-calm-7b": {
"tokenizer": "open-calm-7b",
"organization": "CyberAgent",
"vocab_size": 52000,
"num(digit)": 690,
"len(digit)": "1,3,5",
"num(space)": 1698,
"len(space)": "1,4,33",
"num(ar)": 10,
"len(ar)": "1,1,4",
"num(zh)": 30775,
"len(zh)": "1,3,31",
"num(ja)": 45790,
"len(ja)": "1,3,31",
"num(ja-kana)": 32535,
"len(ja-kana)": "1,3,31",
"num(ko)": 0,
"len(ko)": "-"
},
"databricks/dbrx-instruct": {
"tokenizer": "dbrx-instruct",
"organization": "Databricks",
"vocab_size": 100280,
"num(digit)": 1126,
"len(digit)": "1,3,17",
"num(space)": 47400,
"len(space)": "1,7,128",
"num(ar)": 113,
"len(ar)": "1,2,10",
"num(zh)": 868,
"len(zh)": "1,1,7",
"num(ja)": 1035,
"len(ja)": "1,1,7",
"num(ja-kana)": 169,
"len(ja-kana)": "1,1,7",
"num(ko)": 299,
"len(ko)": "1,2,4"
},
"deepseek-ai/DeepSeek-V2": {
"tokenizer": "DeepSeek-V2",
"organization": "DeepSeek",
"vocab_size": 100002,
"num(digit)": 10,
"len(digit)": "1,1,1",
"num(space)": 48073,
"len(space)": "1,7,128",
"num(ar)": 48,
"len(ar)": "1,1,4",
"num(zh)": 18052,
"len(zh)": "1,2,16",
"num(ja)": 18090,
"len(ja)": "1,2,16",
"num(ja-kana)": 38,
"len(ja-kana)": "1,1,2",
"num(ko)": 16,
"len(ko)": "1,1,2"
},
"deepseek-ai/deepseek-coder-33b-instruct": {
"tokenizer": "deepseek-coder-33b-instruct",
"organization": "DeepSeek",
"vocab_size": 32022,
"num(digit)": 10,
"len(digit)": "1,1,1",
"num(space)": 15254,
"len(space)": "1,6,65",
"num(ar)": 12,
"len(ar)": "1,1,2",
"num(zh)": 4803,
"len(zh)": "1,2,4",
"num(ja)": 4804,
"len(ja)": "1,2,4",
"num(ja-kana)": 1,
"len(ja-kana)": "1,1,1",
"num(ko)": 0,
"len(ko)": "-"
},
"deepseek-ai/deepseek-llm-7b-base": {
"tokenizer": "deepseek-llm-7b-base",
"organization": "DeepSeek",
"vocab_size": 100015,
"num(digit)": 10,
"len(digit)": "1,1,1",
"num(space)": 48073,
"len(space)": "1,7,128",
"num(ar)": 48,
"len(ar)": "1,1,4",
"num(zh)": 18052,
"len(zh)": "1,2,16",
"num(ja)": 18090,
"len(ja)": "1,2,16",
"num(ja-kana)": 38,
"len(ja-kana)": "1,1,2",
"num(ko)": 16,
"len(ko)": "1,1,2"
},
"eson/kplug-base-encoder": {
"tokenizer": "kplug",
"organization": "JD",
"vocab_size": 10261,
"num(digit)": 420,
"len(digit)": "1,3,12",
"num(space)": 0,
"len(space)": "-",
"num(ar)": 0,
"len(ar)": "-",
"num(zh)": 5764,
"len(zh)": "1,1,1",
"num(ja)": 5766,
"len(ja)": "1,1,3",
"num(ja-kana)": 0,
"len(ja-kana)": "-",
"num(ko)": 0,
"len(ko)": "-"
},
"fnlp/moss-moon-003-sft": {
"tokenizer": "moss-moon-003-sft",
"organization": "Fudan",
"vocab_size": 106072,
"num(digit)": 1848,
"len(digit)": "1,3,16",
"num(space)": 33566,
"len(space)": "1,7,102",
"num(ar)": 25,
"len(ar)": "1,1,4",
"num(zh)": 54230,
"len(zh)": "1,2,15",
"num(ja)": 54381,
"len(ja)": "1,2,15",
"num(ja-kana)": 152,
"len(ja-kana)": "1,1,7",
"num(ko)": 0,
"len(ko)": "-"
},
"google/gemma-7b": {
"tokenizer": "gemma-7b",
"organization": "Google",
"vocab_size": 256000,
"num(digit)": 134,
"len(digit)": "1,10,12",
"num(space)": 125662,
"len(space)": "1,7,31",
"num(ar)": 6274,
"len(ar)": "1,4,15",
"num(zh)": 23767,
"len(zh)": "1,2,12",
"num(ja)": 28852,
"len(ja)": "1,2,12",
"num(ja-kana)": 7061,
"len(ja-kana)": "1,3,12",
"num(ko)": 2295,
"len(ko)": "1,1,5"
},
"google/switch-c-2048": {
"tokenizer": "switch-c-2048",
"organization": "Google",
"vocab_size": 32100,
"num(digit)": 1133,
"len(digit)": "1,3,13",
"num(space)": 0,
"len(space)": "-",
"num(ar)": 0,
"len(ar)": "-",
"num(zh)": 0,
"len(zh)": "-",
"num(ja)": 0,
"len(ja)": "-",
"num(ja-kana)": 0,
"len(ja-kana)": "-",
"num(ko)": 0,
"len(ko)": "-"
},
"hfl/chinese-alpaca-lora-7b": {
"tokenizer": "chinese-alpaca-lora-7b",
"organization": "-",
"vocab_size": 49954,
"num(digit)": 614,
"len(digit)": "1,3,5",
"num(space)": 61,
"len(space)": "1,2,15",
"num(ar)": 55,
"len(ar)": "1,1,2",
"num(zh)": 17839,
"len(zh)": "1,2,13",
"num(ja)": 17993,
"len(ja)": "1,2,13",
"num(ja-kana)": 154,
"len(ja-kana)": "1,1,1",
"num(ko)": 135,
"len(ko)": "1,1,1"
},
"hfl/chinese-llama-2-7b": {
"tokenizer": "chinese-llama-2-7b",
"organization": "-",
"vocab_size": 55296,
"num(digit)": 20,
"len(digit)": "1,1,1",
"num(space)": 61,
"len(space)": "1,2,15",
"num(ar)": 55,
"len(ar)": "1,1,2",
"num(zh)": 23974,
"len(zh)": "1,2,16",
"num(ja)": 24111,
"len(ja)": "1,2,16",
"num(ja-kana)": 137,
"len(ja-kana)": "1,1,1",
"num(ko)": 111,
"len(ko)": "1,1,1"
},
"hfl/chinese-llama-lora-7b": {
"tokenizer": "chinese-llama-lora-7b",
"organization": "-",
"vocab_size": 49953,
"num(digit)": 614,
"len(digit)": "1,3,5",
"num(space)": 61,
"len(space)": "1,2,15",
"num(ar)": 55,
"len(ar)": "1,1,2",
"num(zh)": 17839,
"len(zh)": "1,2,13",
"num(ja)": 17993,
"len(ja)": "1,2,13",
"num(ja-kana)": 154,
"len(ja-kana)": "1,1,1",
"num(ko)": 135,
"len(ko)": "1,1,1"
},
"hfl/llama-3-chinese-8b": {
"tokenizer": "llama-3-chinese-8b",
"organization": "-",
"vocab_size": 128256,
"num(digit)": 1110,
"len(digit)": "1,3,3",
"num(space)": 60860,
"len(space)": "1,6,128",
"num(ar)": 3810,
"len(ar)": "1,4,11",
"num(zh)": 4424,
"len(zh)": "1,1,7",
"num(ja)": 5387,
"len(ja)": "1,2,8",
"num(ja-kana)": 1086,
"len(ja-kana)": "1,2,8",
"num(ko)": 2281,
"len(ko)": "1,2,6"
},
"hpcai-tech/grok-1": {
"tokenizer": "grok-1",
"organization": "xAI",
"vocab_size": 131072,
"num(digit)": 40,
"len(digit)": "1,6,13",
"num(space)": 399,
"len(space)": "1,3,16",
"num(ar)": 69,
"len(ar)": "1,2,4",
"num(zh)": 1626,
"len(zh)": "1,2,7",
"num(ja)": 3118,
"len(ja)": "1,2,8",
"num(ja-kana)": 1908,
"len(ja-kana)": "1,2,8",
"num(ko)": 67,
"len(ko)": "1,1,2"
},
"internlm/internlm-chat-7b": {
"tokenizer": "internlm-chat-7b",
"organization": "Shanghai AI Lab",
"vocab_size": 103168,
"num(digit)": 1259,
"len(digit)": "1,3,19",
"num(space)": 33008,
"len(space)": "1,6,128",
"num(ar)": 6702,
"len(ar)": "1,4,16",
"num(zh)": 32000,
"len(zh)": "1,2,15",
"num(ja)": 32866,
"len(ja)": "1,2,15",
"num(ja-kana)": 864,
"len(ja-kana)": "1,2,9",
"num(ko)": 298,
"len(ko)": "1,1,1"
},
"internlm/internlm-xcomposer-7b": {
"tokenizer": "internlm-xcomposer-7b",
"organization": "Shanghai AI Lab",
"vocab_size": 103168,
"num(digit)": 1261,
"len(digit)": "1,3,19",
"num(space)": 33008,
"len(space)": "1,6,128",
"num(ar)": 6702,
"len(ar)": "1,4,16",
"num(zh)": 32000,
"len(zh)": "1,2,15",
"num(ja)": 32866,
"len(ja)": "1,2,15",
"num(ja-kana)": 864,
"len(ja-kana)": "1,2,9",
"num(ko)": 298,
"len(ko)": "1,1,1"
},
"internlm/internlm2-chat-7b": {
"tokenizer": "internlm2-chat-7b",
"organization": "Shanghai AI Lab",
"vocab_size": 92544,
"num(digit)": 1261,
"len(digit)": "1,3,18",
"num(space)": 28681,
"len(space)": "1,7,128",
"num(ar)": 30,
"len(ar)": "1,1,1",
"num(zh)": 31148,
"len(zh)": "1,2,15",
"num(ja)": 31296,
"len(ja)": "1,2,15",
"num(ja-kana)": 148,
"len(ja-kana)": "1,1,1",
"num(ko)": 83,
"len(ko)": "1,1,1"
},
"internlm/internlm2-math-7b": {
"tokenizer": "internlm2-math-7b",
"organization": "Shanghai AI Lab",
"vocab_size": 92544,
"num(digit)": 1261,
"len(digit)": "1,3,18",
"num(space)": 28681,
"len(space)": "1,7,128",
"num(ar)": 30,
"len(ar)": "1,1,1",
"num(zh)": 31148,
"len(zh)": "1,2,15",
"num(ja)": 31296,
"len(ja)": "1,2,15",
"num(ja-kana)": 148,
"len(ja-kana)": "1,1,1",
"num(ko)": 83,
"len(ko)": "1,1,1"
},
"microsoft/Phi-3-mini-4k-instruct": {
"tokenizer": "Phi-3-mini-4k-instruct",
"organization": "Microsoft",
"vocab_size": 32011,
"num(digit)": 20,
"len(digit)": "1,1,1",
"num(space)": 61,
"len(space)": "1,2,15",
"num(ar)": 55,
"len(ar)": "1,1,2",
"num(zh)": 700,
"len(zh)": "1,1,1",
"num(ja)": 837,
"len(ja)": "1,1,1",
"num(ja-kana)": 137,
"len(ja-kana)": "1,1,1",
"num(ko)": 111,
"len(ko)": "1,1,1"
},
"microsoft/phi-1": {
"tokenizer": "phi-1",
"organization": "Microsoft",
"vocab_size": 50295,
"num(digit)": 1691,
"len(digit)": "1,3,16",
"num(space)": 33129,
"len(space)": "1,7,66",
"num(ar)": 22,
"len(ar)": "1,1,3",
"num(zh)": 51,
"len(zh)": "1,1,4",
"num(ja)": 183,
"len(ja)": "1,1,7",
"num(ja-kana)": 133,
"len(ja-kana)": "1,1,7",
"num(ko)": 0,
"len(ko)": "-"
},
"microsoft/phi-2": {
"tokenizer": "phi-2",
"organization": "Microsoft",
"vocab_size": 50295,
"num(digit)": 1691,
"len(digit)": "1,3,16",
"num(space)": 33129,
"len(space)": "1,7,66",
"num(ar)": 22,
"len(ar)": "1,1,3",
"num(zh)": 51,
"len(zh)": "1,1,4",
"num(ja)": 183,
"len(ja)": "1,1,7",
"num(ja-kana)": 133,
"len(ja-kana)": "1,1,7",
"num(ko)": 0,
"len(ko)": "-"
},
"mistralai/Mistral-7B-v0.1": {
"tokenizer": "Mistral-7B-v0.1",
"organization": "Mistral",
"vocab_size": 32000,
"num(digit)": 20,
"len(digit)": "1,1,1",
"num(space)": 85,
"len(space)": "1,3,15",
"num(ar)": 71,
"len(ar)": "1,1,2",
"num(zh)": 1459,
"len(zh)": "1,1,2",
"num(ja)": 1593,
"len(ja)": "1,1,2",
"num(ja-kana)": 134,
"len(ja-kana)": "1,1,1",
"num(ko)": 346,
"len(ko)": "1,1,1"
},
"mistralai/Mixtral-8x7B-v0.1": {
"tokenizer": "Mixtral-8x7B-v0.1",
"organization": "Mistral",
"vocab_size": 32000,
"num(digit)": 20,
"len(digit)": "1,1,1",
"num(space)": 85,
"len(space)": "1,3,15",
"num(ar)": 71,
"len(ar)": "1,1,2",
"num(zh)": 1459,
"len(zh)": "1,1,2",
"num(ja)": 1593,
"len(ja)": "1,1,2",
"num(ja-kana)": 134,
"len(ja-kana)": "1,1,1",
"num(ko)": 346,
"len(ko)": "1,1,1"
},
"openai-community/gpt2": {
"tokenizer": "gpt2",
"organization": "OpenAI",
"vocab_size": 50257,
"num(digit)": 1691,
"len(digit)": "1,3,16",
"num(space)": 33129,
"len(space)": "1,7,66",
"num(ar)": 22,
"len(ar)": "1,1,3",
"num(zh)": 51,
"len(zh)": "1,1,4",
"num(ja)": 183,
"len(ja)": "1,1,7",
"num(ja-kana)": 133,
"len(ja-kana)": "1,1,7",
"num(ko)": 0,
"len(ko)": "-"
},
"openai/code-davinci-002": {
"tokenizer": "code-davinci-002",
"organization": "OpenAI",
"vocab_size": 50281,
"num(digit)": 1691,
"len(digit)": "1,3,16",
"num(space)": 33175,
"len(space)": "1,7,66",
"num(ar)": 22,
"len(ar)": "1,1,3",
"num(zh)": 51,
"len(zh)": "1,1,4",
"num(ja)": 183,
"len(ja)": "1,1,7",
"num(ja-kana)": 133,
"len(ja-kana)": "1,1,7",
"num(ko)": 0,
"len(ko)": "-"
},
"openai/gpt-3.5-turbo": {
"tokenizer": "gpt-3.5-turbo",
"organization": "OpenAI",
"vocab_size": 100277,
"num(digit)": 1110,
"len(digit)": "1,3,3",
"num(space)": 47472,
"len(space)": "1,7,128",
"num(ar)": 113,
"len(ar)": "1,2,10",
"num(zh)": 868,
"len(zh)": "1,1,7",
"num(ja)": 1035,
"len(ja)": "1,1,7",
"num(ja-kana)": 169,
"len(ja-kana)": "1,1,7",
"num(ko)": 299,
"len(ko)": "1,2,4"
},
"openai/gpt-4o": {
"tokenizer": "gpt-4o",
"organization": "OpenAI",
"vocab_size": 200019,
"num(digit)": 1110,
"len(digit)": "1,3,3",
"num(space)": 109316,
"len(space)": "1,6,128",
"num(ar)": 8055,
"len(ar)": "1,4,12",
"num(zh)": 7563,
"len(zh)": "1,2,11",
"num(ja)": 8292,
"len(ja)": "1,2,11",
"num(ja-kana)": 809,
"len(ja-kana)": "1,2,11",
"num(ko)": 2365,
"len(ko)": "1,2,8"
},
"openai/text-davinci-003": {
"tokenizer": "text-davinci-003",
"organization": "OpenAI",
"vocab_size": 50281,
"num(digit)": 1691,
"len(digit)": "1,3,16",
"num(space)": 33175,
"len(space)": "1,7,66",
"num(ar)": 22,
"len(ar)": "1,1,3",
"num(zh)": 51,
"len(zh)": "1,1,4",
"num(ja)": 183,
"len(ja)": "1,1,7",
"num(ja-kana)": 133,
"len(ja-kana)": "1,1,7",
"num(ko)": 0,
"len(ko)": "-"
},
"thu-coai/CharacterGLM-6B": {
"tokenizer": "CharacterGLM-6B",
"organization": "Tsinghua",
"vocab_size": 64789,
"num(digit)": 20,
"len(digit)": "1,1,1",
"num(space)": 67,
"len(space)": "1,2,15",
"num(ar)": 57,
"len(ar)": "1,1,2",
"num(zh)": 30922,
"len(zh)": "1,2,16",
"num(ja)": 31065,
"len(ja)": "1,2,16",
"num(ja-kana)": 143,
"len(ja-kana)": "1,1,1",
"num(ko)": 604,
"len(ko)": "1,1,1"
},
"tiiuae/falcon-180b": {
"tokenizer": "falcon-180b",
"organization": "TII",
"vocab_size": 65024,
"num(digit)": 1108,
"len(digit)": "1,3,3",
"num(space)": 40202,
"len(space)": "1,7,65",
"num(ar)": 21,
"len(ar)": "1,1,4",
"num(zh)": 1627,
"len(zh)": "1,1,3",
"num(ja)": 1652,
"len(ja)": "1,1,3",
"num(ja-kana)": 25,
"len(ja-kana)": "1,1,1",
"num(ko)": 1,
"len(ko)": "1,1,1"
},
"tiiuae/falcon-7b": {
"tokenizer": "falcon-7b",
"organization": "TII",
"vocab_size": 65024,
"num(digit)": 1108,
"len(digit)": "1,3,3",
"num(space)": 40202,
"len(space)": "1,7,65",
"num(ar)": 21,
"len(ar)": "1,1,4",
"num(zh)": 1627,
"len(zh)": "1,1,3",
"num(ja)": 1652,
"len(ja)": "1,1,3",
"num(ja-kana)": 25,
"len(ja-kana)": "1,1,1",
"num(ko)": 1,
"len(ko)": "1,1,1"
},
"Qwen/Qwen1.5-1.8B": {
"tokenizer": "Qwen1.5-1.8B",
"organization": "Alibaba",
"vocab_size": 151646,
"num(digit)": 10,
"len(digit)": "1,1,1",
"num(space)": 55883,
"len(space)": "1,6,128",
"num(ar)": 4018,
"len(ar)": "1,3,12",
"num(zh)": 25557,
"len(zh)": "1,2,7",
"num(ja)": 27206,
"len(ja)": "1,2,11",
"num(ja-kana)": 2089,
"len(ja-kana)": "1,3,11",
"num(ko)": 3495,
"len(ko)": "1,1,5"
},
"Qwen/Qwen1.5-110B": {
"tokenizer": "Qwen1.5-110B",
"organization": "Alibaba",
"vocab_size": 151646,
"num(digit)": 10,
"len(digit)": "1,1,1",
"num(space)": 55883,
"len(space)": "1,6,128",
"num(ar)": 4018,
"len(ar)": "1,3,12",
"num(zh)": 25557,
"len(zh)": "1,2,7",
"num(ja)": 27206,
"len(ja)": "1,2,11",
"num(ja-kana)": 2089,
"len(ja-kana)": "1,3,11",
"num(ko)": 3495,
"len(ko)": "1,1,5"
},
"Qwen/Qwen1.5-14B": {
"tokenizer": "Qwen1.5-14B",
"organization": "Alibaba",
"vocab_size": 151646,
"num(digit)": 10,
"len(digit)": "1,1,1",
"num(space)": 55883,
"len(space)": "1,6,128",
"num(ar)": 4018,
"len(ar)": "1,3,12",
"num(zh)": 25557,
"len(zh)": "1,2,7",
"num(ja)": 27206,
"len(ja)": "1,2,11",
"num(ja-kana)": 2089,
"len(ja-kana)": "1,3,11",
"num(ko)": 3495,
"len(ko)": "1,1,5"
},
"asafaya/bert-base-arabic": {
"tokenizer": "bert-base-arabic",
"organization": "-",
"vocab_size": 32000,
"num(digit)": 507,
"len(digit)": "1,3,21",
"num(space)": 0,
"len(space)": "-",
"num(ar)": 28367,
"len(ar)": "1,5,34",
"num(zh)": 180,
"len(zh)": "1,1,1",
"num(ja)": 333,
"len(ja)": "1,1,3",
"num(ja-kana)": 153,
"len(ja-kana)": "1,1,3",
"num(ko)": 0,
"len(ko)": "-"
},
"rinna/bilingual-gpt-neox-4b": {
"tokenizer": "bilingual-gpt-neox-4b",
"organization": "ABEJA",
"vocab_size": 65536,
"num(digit)": 266,
"len(digit)": "1,6,6",
"num(space)": 3,
"len(space)": "1,1,1",
"num(ar)": 108,
"len(ar)": "1,1,4",
"num(zh)": 30158,
"len(zh)": "1,2,16",
"num(ja)": 40298,
"len(ja)": "1,3,16",
"num(ja-kana)": 21366,
"len(ja-kana)": "1,4,16",
"num(ko)": 384,
"len(ko)": "1,1,1"
},
"01-ai/Yi-1.5-34B": {
"tokenizer": "Yi-1.5-34B",
"organization": "Yi",
"vocab_size": 63992,
"num(digit)": 195,
"len(digit)": "1,13,13",
"num(space)": 43,
"len(space)": "1,2,15",
"num(ar)": 18,
"len(ar)": "1,1,4",
"num(zh)": 21350,
"len(zh)": "1,2,12",
"num(ja)": 21401,
"len(ja)": "1,2,12",
"num(ja-kana)": 51,
"len(ja-kana)": "1,1,2",
"num(ko)": 28,
"len(ko)": "1,1,2"
},
"Qwen/Qwen2-72B": {
"tokenizer": "Qwen2-72B",
"organization": "Alibaba",
"vocab_size": 151646,
"num(digit)": 10,
"len(digit)": "1,1,1",
"num(space)": 55883,
"len(space)": "1,6,128",
"num(ar)": 4018,
"len(ar)": "1,3,12",
"num(zh)": 25557,
"len(zh)": "1,2,7",
"num(ja)": 27206,
"len(ja)": "1,2,11",
"num(ja-kana)": 2089,
"len(ja-kana)": "1,3,11",
"num(ko)": 3495,
"len(ko)": "1,1,5"
},
"apple/DCLM-7B": {
"tokenizer": "DCLM-7B",
"organization": "Apple",
"vocab_size": 50277,
"num(digit)": 2036,
"len(digit)": "1,3,35",
"num(space)": 28996,
"len(space)": "1,7,512",
"num(ar)": 94,
"len(ar)": "1,2,4",
"num(zh)": 313,
"len(zh)": "1,1,2",
"num(ja)": 480,
"len(ja)": "1,1,4",
"num(ja-kana)": 167,
"len(ja-kana)": "1,1,4",
"num(ko)": 25,
"len(ko)": "1,1,2"
},
"google/gemma-2-9b": {
"tokenizer": "gemma-2-9b",
"organization": "Google",
"vocab_size": 256000,
"num(digit)": 134,
"len(digit)": "1,10,12",
"num(space)": 125662,
"len(space)": "1,7,31",
"num(ar)": 6274,
"len(ar)": "1,4,15",
"num(zh)": 23767,
"len(zh)": "1,2,12",
"num(ja)": 28852,
"len(ja)": "1,2,12",
"num(ja-kana)": 7061,
"len(ja-kana)": "1,3,12",
"num(ko)": 2295,
"len(ko)": "1,1,5"
},
"meta-llama/Meta-Llama-3.1-405B": {
"tokenizer": "llama3.1",
"organization": "Meta",
"vocab_size": 128256,
"num(digit)": 1110,
"len(digit)": "1,3,3",
"num(space)": 60860,
"len(space)": "1,6,128",
"num(ar)": 3810,
"len(ar)": "1,4,11",
"num(zh)": 4424,
"len(zh)": "1,1,7",
"num(ja)": 5387,
"len(ja)": "1,2,8",
"num(ja-kana)": 1086,
"len(ja-kana)": "1,2,8",
"num(ko)": 2281,
"len(ko)": "1,2,6"
},
"mistralai/Mistral-Large-Instruct-2407": {
"tokenizer": "Mistral-Large-Instruct-2407",
"organization": "Mistral",
"vocab_size": 32768,
"num(digit)": 775,
"len(digit)": "1,13,18",
"num(space)": 15823,
"len(space)": "1,6,16",
"num(ar)": 71,
"len(ar)": "1,1,3",
"num(zh)": 1459,
"len(zh)": "1,1,2",
"num(ja)": 1593,
"len(ja)": "1,1,2",
"num(ja-kana)": 134,
"len(ja-kana)": "1,1,1",
"num(ko)": 346,
"len(ko)": "1,1,1"
},
"mistralai/Mistral-Nemo-Instruct-2407": {
"tokenizer": "Mistral-Nemo-Instruct-2407",
"organization": "Mistral",
"vocab_size": 131072,
"num(digit)": 996,
"len(digit)": "1,13,13",
"num(space)": 75594,
"len(space)": "1,6,75",
"num(ar)": 9447,
"len(ar)": "1,4,13",
"num(zh)": 3765,
"len(zh)": "1,2,8",
"num(ja)": 5145,
"len(ja)": "1,2,11",
"num(ja-kana)": 1637,
"len(ja-kana)": "1,2,11",
"num(ko)": 4492,
"len(ko)": "1,3,6"
},
"allenai/OLMo-7B-hf": {
"tokenizer": "OLMo-7B-hf",
"organization": "Allen AI",
"vocab_size": 50280,
"num(digit)": 2036,
"len(digit)": "1,3,35",
"num(space)": 29019,
"len(space)": "1,7,512",
"num(ar)": 94,
"len(ar)": "1,2,4",
"num(zh)": 313,
"len(zh)": "1,1,2",
"num(ja)": 480,
"len(ja)": "1,1,4",
"num(ja-kana)": 167,
"len(ja-kana)": "1,1,4",
"num(ko)": 25,
"len(ko)": "1,1,2",
"num(la)": 48651,
"len(la)": "1,6,512"
},
"Qwen/Qwen2-0.5B": {
"tokenizer": "Qwen2-0.5B",
"organization": "Alibaba",
"vocab_size": 151646,
"num(digit)": 10,
"len(digit)": "1,1,1",
"num(space)": 55883,
"len(space)": "1,6,128",
"num(ar)": 4018,
"len(ar)": "1,3,12",
"num(zh)": 25557,
"len(zh)": "1,2,7",
"num(ja)": 27206,
"len(ja)": "1,2,11",
"num(ja-kana)": 2089,
"len(ja-kana)": "1,3,11",
"num(ko)": 3495,
"len(ko)": "1,1,5"
},
"NousResearch/Hermes-3-Llama-3.1-405B": {
"tokenizer": "Hermes-3-Llama-3.1-405B",
"organization": "NousResearch",
"vocab_size": 128256,
"num(digit)": 1110,
"len(digit)": "1,3,3",
"num(space)": 60860,
"len(space)": "1,6,128",
"num(ar)": 3810,
"len(ar)": "1,4,11",
"num(zh)": 4424,
"len(zh)": "1,1,7",
"num(ja)": 5387,
"len(ja)": "1,2,8",
"num(ja-kana)": 1086,
"len(ja-kana)": "1,2,8",
"num(ko)": 2281,
"len(ko)": "1,2,6"
}
}