"""Gradio app that compares how several tokenizers split Assamese text.

Two tabs are exposed:

* "Tokenize Text" shows, per tokenizer, the individual decoded tokens of a
  sentence.
* "Tokenize Paragraphs" shows, per tokenizer, vocabulary size, word count,
  token count and tokens-per-word for a longer text.
"""
import functools
import logging
# import os

import tiktoken
from transformers import AutoTokenizer
import gradio as gr

logger = logging.getLogger(__name__)  # noqa

# hugging face
# hf_token = os.getenv('HUGGINGFACE_TOKEN')
# HfApi().login(token=hf_token)


def load_test_phrases(filename):
    """Return the lines of ``./data/<filename>`` (UTF-8) as a list of strings.

    The path is relative to the current working directory. Line terminators
    are stripped by ``str.splitlines``.
    """
    with open(f"./data/{filename}", "r", encoding="utf-8") as file:
        return file.read().splitlines()


# Tokenizers to compare. Names containing "gpt" are resolved through tiktoken;
# every other name is loaded with ``AutoTokenizer.from_pretrained``.
models = [
    "Xenova/claude-tokenizer",  # Anthropic
    "meta-llama/Llama-2-7b-chat-hf",  # LLAMA-2
    "beomi/llama-2-ko-7b",  # LLAMA-2-ko
    "ai4bharat/Airavata",  # Airavata
    "openaccess-ai-collective/tiny-mistral",  # Mistral
    "gpt-3.5-turbo",  # GPT3.5
    "meta-llama/Meta-Llama-3-8B-Instruct",  # LLAMA-3
    "CohereForAI/aya-23-8B",  # AYA
    "google/gemma-1.1-2b-it",  # GEMMA
    "gpt-4o",  # GPT4o
    "TWO/sutra-mlt256-v2",  # SUTRA
    "tamang0000/assamese-tokenizer-50k",  # Assamese
]

# Short example sentences offered in the "Tokenize Text" tab.
test_phrase_set = [
    "মই আজিৰ পাছত হ’ব লগা হাঁহিৰ বাবে ওলাই থাকিম",
    "আমি চন্দ্ৰলৈ ৰকেট যাত্ৰাত আছোঁ",

    "পাঁচখন বাক্যৰে নিউট্ৰন বিকিৰণৰ বৰ্ণনা দিয়ক",  # Assamese
    "আমাক পাঁচখন বাক্যৰে নিউট্ৰন বিকিৰণৰ বৰ্ণনা দিয়ক",

    "মোৰ বন্ধুটোৱে চাৰিটা পুথি পঢ়িছে",  # Assamese
    "মোৰ ঘৰখন গাঁওখনৰ আটাইতকৈ বেছি ডাঙৰ",  # Assamese
    "আজিৰে পৰা মই সৰু সৰু কামবোৰ কৰি থাকিম",  # Assamese
    "তেওঁৰ মাতবোৰ আৰু শাৰীবোৰ সলনি হোৱা দেখি চমক লাগিল",  # Assamese
]

# Longer example texts offered in the "Tokenize Paragraphs" tab.
test_phrase_set_long_1 = load_test_phrases('multilingualphrases01-as.txt')
test_phrase_set_long_2 = load_test_phrases('multilingualphrases02-as.txt')
# test_phrase_set_long_3 = load_test_phrases('multilingualphrases03.txt')


def _uses_tiktoken(model):
    """Return True when *model* is resolved through tiktoken (name has 'gpt')."""
    return 'gpt' in model


@functools.lru_cache(maxsize=32)
def _load_tokenizer(model):
    """Load the tokenizer for *model* once and reuse it on later calls.

    Without the cache every button click re-instantiated all tokenizers.
    NOTE(review): the cached objects are shared by concurrent requests;
    confirm the tokenizer backends tolerate that under the queue's
    concurrency limit.
    """
    if _uses_tiktoken(model):
        return tiktoken.encoding_for_model(model)
    return AutoTokenizer.from_pretrained(model)


def generate_tokens_as_table(text):
    """Tokenize *text* with every model and return one row per model.

    Each row is ``[model_name, token_0, token_1, ...]`` where every token is
    the result of decoding a single token id. Rows may have different lengths
    because each tokenizer splits the text differently.
    """
    table = []
    for model in models:
        tokenizer = _load_tokenizer(model)
        if _uses_tiktoken(model):
            tokens = tokenizer.encode(text)
        else:
            tokens = tokenizer.encode(text, add_special_tokens=False)
        decoded = [tokenizer.decode([t]) for t in tokens]
        table.append([model] + decoded)
    return table


def generate_tokenizer_table(text):
    """Return per-model statistics rows for *text*.

    Each row is ``[model, vocab_size, word_count, token_count, ratio]`` where
    ``ratio`` is tokens per word formatted with two decimals. Returns ``[]``
    when *text* is empty or contains only whitespace.
    """
    # Split on any run of whitespace so repeated spaces do not inflate the
    # count and newlines/tabs separate words too.
    word_count = len(text.split()) if text else 0
    if word_count == 0:
        # Nothing to count; also avoids dividing by zero below.
        return []

    output = []
    for model in models:
        tokenizer = _load_tokenizer(model)
        if _uses_tiktoken(model):
            vocab_size = tokenizer.n_vocab
        else:
            vocab_size = tokenizer.vocab_size
        # encode() is called with its defaults here, unlike
        # generate_tokens_as_table, which passes add_special_tokens=False
        # for the Hugging Face tokenizers.
        token_count = len(tokenizer.encode(text))
        output.append([
            model,
            vocab_size,
            word_count,
            token_count,
            f"{token_count / word_count:0.2f}",
        ])
    return output


def generate_split_token_table(text):
    """Build the statistics Dataframe shown in the "Tokenize Paragraphs" tab.

    Returns an empty ``gr.Dataframe`` when there is nothing to tokenize.
    """
    if not text:
        return gr.Dataframe()

    table = generate_tokenizer_table(text)
    if not table:
        return gr.Dataframe()

    return gr.Dataframe(
        table,
        headers=['tokenizer', 'v size', '#word', '#token', '#tokens/word'],
        # One datatype per column; the ratio column is a pre-formatted string.
        datatype=["str", "number", "number", "number", "str"],
        row_count=len(models),
        col_count=(5, "fixed"),
    )


with gr.Blocks() as sutra_token_count:
    gr.Markdown(
        """
        # Multilingual Tokenizer Specs & Stats.
        ## Tokenize paragraphs in multiple languages and compare token counts.
        Space inspired from [SUTRA](https://huggingface.co/spaces/TWO/sutra-tokenizer-comparison)
        """)
    textbox = gr.Textbox(label="Input Text")
    submit_button = gr.Button("Submit")
    output = gr.Dataframe()
    examples = [
        [' '.join(test_phrase_set_long_1)],
        [' '.join(test_phrase_set_long_2)],
        # [' '.join(test_phrase_set_long_3)],
    ]
    gr.Examples(examples=examples, inputs=[textbox])
    submit_button.click(generate_split_token_table, inputs=[textbox], outputs=[output])


def generate_tokens_table(text):
    """Build the per-token Dataframe shown in the "Tokenize Text" tab.

    Rows are padded with empty strings to the width of the longest row so
    that every row matches the fixed column count. Returns an empty
    ``gr.Dataframe`` when *text* is empty.
    """
    if not text:
        return gr.Dataframe()

    table = generate_tokens_as_table(text)
    # Tokenizers produce different numbers of tokens, so size the table by
    # the widest row rather than by the first one.
    cols = max(len(row) for row in table)
    padded = [row + [""] * (cols - len(row)) for row in table]
    return gr.Dataframe(
        padded,
        headers=['model'] + [str(i) for i in range(cols - 1)],
        row_count=len(padded),
        col_count=(cols, "fixed"),
    )


with gr.Blocks() as sutra_tokenize:
    gr.Markdown(
        """
        # Multilingual Tokenizer Sentence Inspector.
        ## Tokenize a sentence with various tokenizers and inspect how it's broken down.
        Space inspired from [SUTRA](https://huggingface.co/spaces/TWO/sutra-tokenizer-comparison)
        """)
    textbox = gr.Textbox(label="Input Text")
    submit_button = gr.Button("Submit")
    output = gr.Dataframe()
    examples = test_phrase_set
    gr.Examples(examples=examples, inputs=[textbox])
    submit_button.click(generate_tokens_table, inputs=[textbox], outputs=[output])


if __name__ == '__main__':
    with gr.Blocks(analytics_enabled=False) as demo:
        with gr.Row():
            gr.Markdown(
                """
                ## 
                """
            )
        with gr.Row():
            gr.TabbedInterface(
                interface_list=[sutra_tokenize, sutra_token_count],
                tab_names=["Tokenize Text", "Tokenize Paragraphs"]
            )

    # No allowed_paths are granted: the app serves no local files, and the
    # previous allowed_paths=["/"] made every file on the host readable by
    # any client that can reach this server (it binds to all interfaces).
    demo.queue(default_concurrency_limit=5).launch(
        server_name="0.0.0.0",
    )