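"""Gradio app for comparing multilingual tokenizers.

Two tabs are exposed: one tokenizes a single sentence with each model's
tokenizer and shows the individual token pieces, the other tokenizes longer
multilingual paragraphs and reports vocabulary size, word count, token count,
and tokens-per-word for each model.
"""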
import logging

import gradio as gr
import tiktoken
from transformers import AutoTokenizer

logger = logging.getLogger(__name__)


def load_test_phrases(filename):
    """Read one test phrase per line from a text file under ./data/."""
    with open(f"./data/{filename}", "r", encoding="utf-8") as file:
        return file.read().splitlines()


# Tokenizers to compare: Hugging Face checkpoints plus the OpenAI models
# ("gpt-3.5-turbo", "gpt-4o"), which are handled through tiktoken.
models = ["Xenova/claude-tokenizer",
          "meta-llama/Llama-2-7b-chat-hf",
          "beomi/llama-2-ko-7b",
          "ai4bharat/Airavata",
          "openaccess-ai-collective/tiny-mistral",
          "gpt-3.5-turbo",
          "meta-llama/Meta-Llama-3-8B-Instruct",
          "CohereForAI/aya-23-8B",
          "google/gemma-1.1-2b-it",
          "gpt-4o",
          "TWO/sutra-mlt256-v2",
          "tamang0000/assamese-tokenizer-50k"
          ]
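# Note: some of these checkpoints (e.g. the meta-llama models) are gated on the
# Hugging Face Hub, so loading their tokenizers may require accepting the model
# license and authenticating first (for instance with `huggingface-cli login`).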
# Short single-sentence test phrases. Apart from the English sentence and the
# first Tamil phrase ("We are on a rocket trip to the moon"), the phrases all
# ask for an explanation of neutron scattering in five sentences.
test_phrase_set = [
    # English
    "I am going for a walk later today",
    # Tamil
    "நாங்கள் சந்திரனுக்கு ராக்கெட் பயணத்தில் இருக்கிறோம்",
    # Korean
    "중성자 산란을 다섯 문장으로 설명해주세요",
    # Hindi (Devanagari and romanized)
    "मुझे पाँच वाक्यों में न्यूट्रॉन प्रकीर्णन की व्याख्या दीजिए",
    "mujhe paanch vaakyon mein nyootron prakeernan kee vyaakhya deejie",
    # Bengali (Bengali script and romanized)
    "আমাকে পাঁচটি বাক্যে নিউট্রন বিচ্ছুরণের একটি ব্যাখ্যা দিন",
    "Amake pamcati bakye ni'utrana bicchuranera ekati byakhya dina",
    # Gujarati (Gujarati script and romanized)
    "મને પાંચ વાક્યોમાં ન્યુટ્રોન સ્કેટરિંગની સમજૂતી આપો",
    "Mane panca vakyomam n'yutrona sketaringani samajuti apo",
    # Tamil (Tamil script and romanized)
    "நியூட்ரான் சிதறல் பற்றிய விளக்கத்தை ஐந்து வாக்கியங்களில் கொடுங்கள்",
    "Niyutran citaral parriya vilakkattai aintu vakkiyankalil kotunkal",
    # Marathi
    "मला पाच वाक्यात न्यूट्रॉन स्कॅटरिंगचे स्पष्टीकरण द्या",
    # Telugu
    "ఐదు వాక్యాలలో న్యూట్రాన్ స్కాటరింగ్ గురించి నాకు వివరణ ఇవ్వండి",
]

# Longer multilingual paragraphs, one file per example.
test_phrase_set_long_1 = load_test_phrases('multilingualphrases01.txt')
test_phrase_set_long_2 = load_test_phrases('multilingualphrases02.txt')
test_phrase_set_long_3 = load_test_phrases('multilingualphrases03.txt')


def generate_tokens_as_table(text):
    """Tokenize `text` with every model and return one row per model:
    [model_name, token_1, token_2, ...], with tokens decoded back to strings."""
    table = []
    for model in models:
        if 'gpt' not in model:
            tokenizer = AutoTokenizer.from_pretrained(model)
            tokens = tokenizer.encode(text, add_special_tokens=False)
        else:
            tokenizer = tiktoken.encoding_for_model(model)
            tokens = tokenizer.encode(text)
        decoded = [tokenizer.decode([t]) for t in tokens]
        table.append([model] + decoded)
    return table


def generate_tokenizer_table(text):
    """Return per-model rows of [model, vocab size, word count, token count,
    tokens per word] for `text`."""
    if not text:
        return []

    token_counts = {model: 0 for model in models}
    vocab_size = {model: 0 for model in models}

    for model in models:
        if 'gpt' not in model:
            tokenizer = AutoTokenizer.from_pretrained(model)
            vocab_size[model] = tokenizer.vocab_size
        else:
            tokenizer = tiktoken.encoding_for_model(model)
            vocab_size[model] = tokenizer.n_vocab

        token_counts[model] += len(tokenizer.encode(text))

    # Whitespace word count is a rough baseline for the tokens-per-word ratio.
    word_count = len(text.split(' '))

    output = []
    for m in models:
        row = [m, vocab_size[m], word_count, token_counts[m], f"{token_counts[m] / word_count:0.2f}"]
        output.append(row)

    return output
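# Both table builders above create a fresh tokenizer per model on every request,
# which re-reads (and on first use, downloads) the tokenizer files. A minimal
# caching sketch follows; `get_cached_tokenizer` is a hypothetical helper that is
# not wired into the functions above, and it assumes tokenizers are safe to reuse
# across requests (which holds for the read-only encode/decode calls used here).
from functools import lru_cache


@lru_cache(maxsize=None)
def get_cached_tokenizer(model):
    # Same dispatch rule as the functions above: OpenAI model names go through
    # tiktoken, everything else through the Hugging Face AutoTokenizer.
    if 'gpt' in model:
        return tiktoken.encoding_for_model(model)
    return AutoTokenizer.from_pretrained(model)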
def generate_split_token_table(text):
    """Wrap the per-model statistics in a gr.Dataframe for display."""
    if not text:
        return gr.Dataframe()

    table = generate_tokenizer_table(text)
    return gr.Dataframe(
        table,
        headers=['tokenizer', 'vocab size', '# words', '# tokens', '# tokens / word'],
        datatype=["str", "number", "number", "number", "str"],
        row_count=len(models),
        col_count=(5, "fixed"),
    )


with gr.Blocks() as sutra_token_count:
    gr.Markdown(
        """
        # Multilingual Tokenizer Specs & Stats.
        ## Tokenize paragraphs in multiple languages and compare token counts.
        Space inspired by [SUTRA](https://huggingface.co/spaces/TWO/sutra-tokenizer-comparison)
        """)
    textbox = gr.Textbox(label="Input Text")
    submit_button = gr.Button("Submit")
    output = gr.Dataframe()
    examples = [
        [' '.join(test_phrase_set_long_1)],
        [' '.join(test_phrase_set_long_2)],
        [' '.join(test_phrase_set_long_3)],
    ]
    gr.Examples(examples=examples, inputs=[textbox])
    submit_button.click(generate_split_token_table, inputs=[textbox], outputs=[output])


def generate_tokens_table(text):
    """Wrap the per-token breakdown in a gr.Dataframe for display."""
    table = generate_tokens_as_table(text)
    # Rows can have different lengths (models produce different token counts),
    # so size the frame to the widest row and pad the rest.
    cols = max(len(row) for row in table)
    padded = [row + [''] * (cols - len(row)) for row in table]
    return gr.Dataframe(
        padded,
        headers=['model'] + [str(i) for i in range(cols - 1)],
        row_count=len(padded),
        col_count=(cols, "fixed"),
    )


with gr.Blocks() as sutra_tokenize:
    gr.Markdown(
        """
        # Multilingual Tokenizer Sentence Inspector.
        ## Tokenize a sentence with various tokenizers and inspect how it's broken down.
        Space inspired by [SUTRA](https://huggingface.co/spaces/TWO/sutra-tokenizer-comparison)
        """)
    textbox = gr.Textbox(label="Input Text")
    submit_button = gr.Button("Submit")
    output = gr.Dataframe()
    examples = test_phrase_set
    gr.Examples(examples=examples, inputs=[textbox])
    submit_button.click(generate_tokens_table, inputs=[textbox], outputs=[output])
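# Entry point: running the script directly (e.g. `python app.py`, assuming this
# file is saved as app.py) builds the two-tab interface below. Gradio serves on
# port 7860 by default; this can be changed via launch(server_port=...) or the
# GRADIO_SERVER_PORT environment variable.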
if __name__ == '__main__':
    with gr.Blocks(analytics_enabled=False) as demo:
        with gr.Row():
            gr.Markdown(
                """
                ## <img src="https://raw.githubusercontent.com/SAGAR-TAMANG/sagar-tamang-official-website-new/master/img/pi.jpg" height="20"/>
                """
            )
        with gr.Row():
            gr.TabbedInterface(
                interface_list=[sutra_tokenize, sutra_token_count],
                tab_names=["Tokenize Text", "Tokenize Paragraphs"]
            )

    # Bind to all interfaces so the app is reachable inside containers / Spaces.
    demo.queue(default_concurrency_limit=5).launch(
        server_name="0.0.0.0",
        allowed_paths=["/"],
    )