# Gradio Space: compare how efficiently a tokenizer encodes different FLORES-200 languages.
from datasets import load_dataset
import numpy as np
from transformers import AutoTokenizer
import gradio as gr

# FLORES-200 language names and dataset codes, one "<name> | <code>" pair per line.
lang_codes = """Acehnese (Arabic script) | ace_Arab
Acehnese (Latin script) | ace_Latn
Mesopotamian Arabic | acm_Arab
Ta’izzi-Adeni Arabic | acq_Arab
Tunisian Arabic | aeb_Arab
Afrikaans | afr_Latn
South Levantine Arabic | ajp_Arab
Akan | aka_Latn
Amharic | amh_Ethi
North Levantine Arabic | apc_Arab
Modern Standard Arabic | arb_Arab
Modern Standard Arabic (Romanized) | arb_Latn
Najdi Arabic | ars_Arab
Moroccan Arabic | ary_Arab
Egyptian Arabic | arz_Arab
Assamese | asm_Beng
Asturian | ast_Latn
Awadhi | awa_Deva
Central Aymara | ayr_Latn
South Azerbaijani | azb_Arab
North Azerbaijani | azj_Latn
Bashkir | bak_Cyrl
Bambara | bam_Latn
Balinese | ban_Latn
Belarusian | bel_Cyrl
Bemba | bem_Latn
Bengali | ben_Beng
Bhojpuri | bho_Deva
Banjar (Arabic script) | bjn_Arab
Banjar (Latin script) | bjn_Latn
Standard Tibetan | bod_Tibt
Bosnian | bos_Latn
Buginese | bug_Latn
Bulgarian | bul_Cyrl
Catalan | cat_Latn
Cebuano | ceb_Latn
Czech | ces_Latn
Chokwe | cjk_Latn
Central Kurdish | ckb_Arab
Crimean Tatar | crh_Latn
Welsh | cym_Latn
Danish | dan_Latn
German | deu_Latn
Southwestern Dinka | dik_Latn
Dyula | dyu_Latn
Dzongkha | dzo_Tibt
Greek | ell_Grek
English | eng_Latn
Esperanto | epo_Latn
Estonian | est_Latn
Basque | eus_Latn
Ewe | ewe_Latn
Faroese | fao_Latn
Fijian | fij_Latn
Finnish | fin_Latn
Fon | fon_Latn
French | fra_Latn
Friulian | fur_Latn
Nigerian Fulfulde | fuv_Latn
Scottish Gaelic | gla_Latn
Irish | gle_Latn
Galician | glg_Latn
Guarani | grn_Latn
Gujarati | guj_Gujr
Haitian Creole | hat_Latn
Hausa | hau_Latn
Hebrew | heb_Hebr
Hindi | hin_Deva
Chhattisgarhi | hne_Deva
Croatian | hrv_Latn
Hungarian | hun_Latn
Armenian | hye_Armn
Igbo | ibo_Latn
Ilocano | ilo_Latn
Indonesian | ind_Latn
Icelandic | isl_Latn
Italian | ita_Latn
Javanese | jav_Latn
Japanese | jpn_Jpan
Kabyle | kab_Latn
Jingpho | kac_Latn
Kamba | kam_Latn
Kannada | kan_Knda
Kashmiri (Arabic script) | kas_Arab
Kashmiri (Devanagari script) | kas_Deva
Georgian | kat_Geor
Central Kanuri (Arabic script) | knc_Arab
Central Kanuri (Latin script) | knc_Latn
Kazakh | kaz_Cyrl
Kabiyè | kbp_Latn
Kabuverdianu | kea_Latn
Khmer | khm_Khmr
Kikuyu | kik_Latn
Kinyarwanda | kin_Latn
Kyrgyz | kir_Cyrl
Kimbundu | kmb_Latn
Northern Kurdish | kmr_Latn
Kikongo | kon_Latn
Korean | kor_Hang
Lao | lao_Laoo
Ligurian | lij_Latn
Limburgish | lim_Latn
Lingala | lin_Latn
Lithuanian | lit_Latn
Lombard | lmo_Latn
Latgalian | ltg_Latn
Luxembourgish | ltz_Latn
Luba-Kasai | lua_Latn
Ganda | lug_Latn
Luo | luo_Latn
Mizo | lus_Latn
Standard Latvian | lvs_Latn
Magahi | mag_Deva
Maithili | mai_Deva
Malayalam | mal_Mlym
Marathi | mar_Deva
Minangkabau (Arabic script) | min_Arab
Minangkabau (Latin script) | min_Latn
Macedonian | mkd_Cyrl
Plateau Malagasy | plt_Latn
Maltese | mlt_Latn
Meitei (Bengali script) | mni_Beng
Halh Mongolian | khk_Cyrl
Mossi | mos_Latn
Maori | mri_Latn
Burmese | mya_Mymr
Dutch | nld_Latn
Norwegian Nynorsk | nno_Latn
Norwegian Bokmål | nob_Latn
Nepali | npi_Deva
Northern Sotho | nso_Latn
Nuer | nus_Latn
Nyanja | nya_Latn
Occitan | oci_Latn
West Central Oromo | gaz_Latn
Odia | ory_Orya
Pangasinan | pag_Latn
Eastern Panjabi | pan_Guru
Papiamento | pap_Latn
Western Persian | pes_Arab
Polish | pol_Latn
Portuguese | por_Latn
Dari | prs_Arab
Southern Pashto | pbt_Arab
Ayacucho Quechua | quy_Latn
Romanian | ron_Latn
Rundi | run_Latn
Russian | rus_Cyrl
Sango | sag_Latn
Sanskrit | san_Deva
Santali | sat_Olck
Sicilian | scn_Latn
Shan | shn_Mymr
Sinhala | sin_Sinh
Slovak | slk_Latn
Slovenian | slv_Latn
Samoan | smo_Latn
Shona | sna_Latn
Sindhi | snd_Arab
Somali | som_Latn
Southern Sotho | sot_Latn
Spanish | spa_Latn
Tosk Albanian | als_Latn
Sardinian | srd_Latn
Serbian | srp_Cyrl
Swati | ssw_Latn
Sundanese | sun_Latn
Swedish | swe_Latn
Swahili | swh_Latn
Silesian | szl_Latn
Tamil | tam_Taml
Tatar | tat_Cyrl
Telugu | tel_Telu
Tajik | tgk_Cyrl
Tagalog | tgl_Latn
Thai | tha_Thai
Tigrinya | tir_Ethi
Tamasheq (Latin script) | taq_Latn
Tamasheq (Tifinagh script) | taq_Tfng
Tok Pisin | tpi_Latn
Tswana | tsn_Latn
Tsonga | tso_Latn
Turkmen | tuk_Latn
Tumbuka | tum_Latn
Turkish | tur_Latn
Twi | twi_Latn
Central Atlas Tamazight | tzm_Tfng
Uyghur | uig_Arab
Ukrainian | ukr_Cyrl
Umbundu | umb_Latn
Urdu | urd_Arab
Northern Uzbek | uzn_Latn
Venetian | vec_Latn
Vietnamese | vie_Latn
Waray | war_Latn
Wolof | wol_Latn
Xhosa | xho_Latn
Eastern Yiddish | ydd_Hebr
Yoruba | yor_Latn
Yue Chinese | yue_Hant
Chinese (Simplified) | zho_Hans
Chinese (Traditional) | zho_Hant
Standard Malay | zsm_Latn
Zulu | zul_Latn"""

# Map full language names to FLORES-200 codes.
lang_codes = {l.split(" | ")[0]: l.split(" | ")[1] for l in lang_codes.split("\n")}

dataset = load_dataset("facebook/flores", "all", trust_remote_code=True)["dev"]

# Group the parallel dev sentences by full language name.
data_per_lang = {}
for d in dataset:
    for full, code in lang_codes.items():
        data_per_lang.setdefault(full, []).append(d[f"sentence_{code}"])


def get_results(tokenizer_name, base_lang, comp_lang, HF_token=""):
    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_name, token=HF_token if HF_token else None
    )
    base_data = data_per_lang[base_lang]
    comp_data = data_per_lang[comp_lang]
    base_results = []
    comp_results = []
    for base_d, comp_d in zip(base_data, comp_data):
        base_results.append(len(tokenizer(base_d)["input_ids"]))
        comp_results.append(len(tokenizer(comp_d)["input_ids"]))
    agg_base = np.array(base_results).mean()
    agg_comp = np.array(comp_results).mean()
    # Ratio of average token counts, converted to a percentage difference.
    token_ratio = agg_comp / agg_base
    if token_ratio < 1.0:
        adverb = "fewer"
        token_ratio = (1.0 - token_ratio) * 100
    else:
        adverb = "more"
        token_ratio = (token_ratio - 1.0) * 100
    output = (
        f"**You need {round(token_ratio, 3)}% {adverb} tokens to represent "
        f"your text in {comp_lang} than in {base_lang}.**"
    )
    return output


with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown("""

# Language tokenization comparison

This tool helps you calculate how many more (or fewer) tokens a given tokenizer needs to represent the same text in different languages. The comparison uses the [FLORES-200](https://github.com/facebookresearch/flores/tree/main) dataset, developed by Meta, which provides the same set of sentences translated into roughly 200 languages, including many low-resource ones. We tokenize the ~1,000 parallel sentences of the dev split in both the base language and the comparison language, then compare the average `input_ids` length.""")
    with gr.Row():
        with gr.Column():
            tokenizer = gr.Textbox(label="Tokenizer name", value="bert-base-cased")
        with gr.Column():
            HF_token = gr.Textbox(label="Your HF token (optional)")
    with gr.Row():
        with gr.Column():
            base_lang = gr.Dropdown(list(lang_codes.keys()), label="Base language")
        with gr.Column():
            comp_lang = gr.Dropdown(list(lang_codes.keys()), label="Comparison language")
    with gr.Row():
        btn = gr.Button("Submit")
    out_text = gr.Markdown()
    btn.click(
        get_results,
        inputs=[tokenizer, base_lang, comp_lang, HF_token],
        outputs=[out_text],
        api_name=False,
    )

demo.launch()
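
# ---------------------------------------------------------------------------
# Usage note: the measurement described above can be reproduced without the UI.
# The snippet below is a minimal sketch, not part of the app: the tokenizer and
# the English/French pair are illustrative choices, and it reuses the
# `data_per_lang` dict built earlier. Uncomment and run it in place of
# `demo.launch()` above.
#
#     tok = AutoTokenizer.from_pretrained("bert-base-cased")
#
#     def avg_len(sents):
#         # Mean number of input_ids (special tokens included) per sentence.
#         return sum(len(tok(s)["input_ids"]) for s in sents) / len(sents)
#
#     ratio = avg_len(data_per_lang["French"]) / avg_len(data_per_lang["English"])
#     print(f"French/English token ratio: {ratio:.3f}")  # > 1 means French needs more tokens
# ---------------------------------------------------------------------------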