arabic-tokenizers-leaderboard / arabic_tokenizers_leaderboard.jsonl
MohamedRashad's picture
refactor: improve tokenization for Arabic text
d2e2dfe
raw
history blame
2.9 kB
{"👳 Tokenize Tashkeel":"❌","📛 Models":"asafaya\/bert-base-arabic","🪺 Fertility Score":1.614,"➕ Total Number of Tokens":1242530,"📘 Vocab Size":32000,"Tokenizer Class":"BertTokenizerFast"}
{"👳 Tokenize Tashkeel":"✅","📛 Models":"core42\/jais-13b","🪺 Fertility Score":1.668,"➕ Total Number of Tokens":1284508,"📘 Vocab Size":84992,"Tokenizer Class":"PreTrainedTokenizerFast"}
{"👳 Tokenize Tashkeel":"✅","📛 Models":"core42\/jais-30b-chat-v3","🪺 Fertility Score":1.668,"➕ Total Number of Tokens":1284508,"📘 Vocab Size":84992,"Tokenizer Class":"PreTrainedTokenizerFast"}
{"👳 Tokenize Tashkeel":"✅","📛 Models":"FreedomIntelligence\/AceGPT-v1.5-13B-Chat","🪺 Fertility Score":1.888,"➕ Total Number of Tokens":1453838,"📘 Vocab Size":44800,"Tokenizer Class":"LlamaTokenizerFast"}
{"👳 Tokenize Tashkeel":"✅","📛 Models":"Xenova\/gpt-4o","🪺 Fertility Score":2.115,"➕ Total Number of Tokens":1628374,"📘 Vocab Size":200000,"Tokenizer Class":"GPT2TokenizerFast"}
{"👳 Tokenize Tashkeel":"❌","📛 Models":"CohereForAI\/c4ai-command-r-v01","🪺 Fertility Score":2.154,"➕ Total Number of Tokens":1658463,"📘 Vocab Size":255000,"Tokenizer Class":"CohereTokenizerFast"}
{"👳 Tokenize Tashkeel":"❌","📛 Models":"CohereForAI\/c4ai-command-r-plus","🪺 Fertility Score":2.154,"➕ Total Number of Tokens":1658463,"📘 Vocab Size":255000,"Tokenizer Class":"CohereTokenizerFast"}
{"👳 Tokenize Tashkeel":"✅","📛 Models":"unsloth\/gemma-2b-bnb-4bit","🪺 Fertility Score":2.199,"➕ Total Number of Tokens":1692826,"📘 Vocab Size":256000,"Tokenizer Class":"GemmaTokenizerFast"}
{"👳 Tokenize Tashkeel":"✅","📛 Models":"NousResearch\/Meta-Llama-3-8B","🪺 Fertility Score":2.374,"➕ Total Number of Tokens":1827816,"📘 Vocab Size":128000,"Tokenizer Class":"PreTrainedTokenizerFast"}
{"👳 Tokenize Tashkeel":"❌","📛 Models":"Qwen\/Qwen1.5-7B-Chat","🪺 Fertility Score":2.444,"➕ Total Number of Tokens":1881958,"📘 Vocab Size":151643,"Tokenizer Class":"Qwen2TokenizerFast"}
{"👳 Tokenize Tashkeel":"❌","📛 Models":"Qwen\/Qwen1.5-110B-Chat","🪺 Fertility Score":2.444,"➕ Total Number of Tokens":1881958,"📘 Vocab Size":151643,"Tokenizer Class":"Qwen2TokenizerFast"}
{"👳 Tokenize Tashkeel":"✅","📛 Models":"FreedomIntelligence\/AceGPT-13B","🪺 Fertility Score":5.46,"➕ Total Number of Tokens":4203685,"📘 Vocab Size":32000,"Tokenizer Class":"LlamaTokenizerFast"}
{"👳 Tokenize Tashkeel":"✅","📛 Models":"microsoft\/Phi-3-mini-128k-instruct","🪺 Fertility Score":5.46,"➕ Total Number of Tokens":4203685,"📘 Vocab Size":32000,"Tokenizer Class":"LlamaTokenizerFast"}
{"👳 Tokenize Tashkeel":"✅","📛 Models":"01-ai\/Yi-1.5-34B-Chat","🪺 Fertility Score":6.674,"➕ Total Number of Tokens":5138447,"📘 Vocab Size":64000,"Tokenizer Class":"LlamaTokenizerFast"}