from datasets import load_dataset import numpy as np from transformers import AutoTokenizer import gradio as gr lang_codes = """Acehnese (Arabic script) | ace_Arab Acehnese (Latin script) | ace_Latn Mesopotamian Arabic | acm_Arab Ta’izzi-Adeni Arabic | acq_Arab Tunisian Arabic | aeb_Arab Afrikaans | afr_Latn South Levantine Arabic | ajp_Arab Akan | aka_Latn Amharic | amh_Ethi North Levantine Arabic | apc_Arab Modern Standard Arabic | arb_Arab Modern Standard Arabic (Romanized) | arb_Latn Najdi Arabic | ars_Arab Moroccan Arabic | ary_Arab Egyptian Arabic | arz_Arab Assamese | asm_Beng Asturian | ast_Latn Awadhi | awa_Deva Central Aymara | ayr_Latn South Azerbaijani | azb_Arab North Azerbaijani | azj_Latn Bashkir | bak_Cyrl Bambara | bam_Latn Balinese | ban_Latn Belarusian | bel_Cyrl Bemba | bem_Latn Bengali | ben_Beng Bhojpuri | bho_Deva Banjar (Arabic script) | bjn_Arab Banjar (Latin script) | bjn_Latn Standard Tibetan | bod_Tibt Bosnian | bos_Latn Buginese | bug_Latn Bulgarian | bul_Cyrl Catalan | cat_Latn Cebuano | ceb_Latn Czech | ces_Latn Chokwe | cjk_Latn Central Kurdish | ckb_Arab Crimean Tatar | crh_Latn Welsh | cym_Latn Danish | dan_Latn German | deu_Latn Southwestern Dinka | dik_Latn Dyula | dyu_Latn Dzongkha | dzo_Tibt Greek | ell_Grek English | eng_Latn Esperanto | epo_Latn Estonian | est_Latn Basque | eus_Latn Ewe | ewe_Latn Faroese | fao_Latn Fijian | fij_Latn Finnish | fin_Latn Fon | fon_Latn French | fra_Latn Friulian | fur_Latn Nigerian Fulfulde | fuv_Latn Scottish Gaelic | gla_Latn Irish | gle_Latn Galician | glg_Latn Guarani | grn_Latn Gujarati | guj_Gujr Haitian Creole | hat_Latn Hausa | hau_Latn Hebrew | heb_Hebr Hindi | hin_Deva Chhattisgarhi | hne_Deva Croatian | hrv_Latn Hungarian | hun_Latn Armenian | hye_Armn Igbo | ibo_Latn Ilocano | ilo_Latn Indonesian | ind_Latn Icelandic | isl_Latn Italian | ita_Latn Javanese | jav_Latn Japanese | jpn_Jpan Kabyle | kab_Latn Jingpho | kac_Latn Kamba | kam_Latn Kannada | kan_Knda Kashmiri (Arabic script) | kas_Arab Kashmiri (Devanagari script) | kas_Deva Georgian | kat_Geor Central Kanuri (Arabic script) | knc_Arab Central Kanuri (Latin script) | knc_Latn Kazakh | kaz_Cyrl Kabiyè | kbp_Latn Kabuverdianu | kea_Latn Khmer | khm_Khmr Kikuyu | kik_Latn Kinyarwanda | kin_Latn Kyrgyz | kir_Cyrl Kimbundu | kmb_Latn Northern Kurdish | kmr_Latn Kikongo | kon_Latn Korean | kor_Hang Lao | lao_Laoo Ligurian | lij_Latn Limburgish | lim_Latn Lingala | lin_Latn Lithuanian | lit_Latn Lombard | lmo_Latn Latgalian | ltg_Latn Luxembourgish | ltz_Latn Luba-Kasai | lua_Latn Ganda | lug_Latn Luo | luo_Latn Mizo | lus_Latn Standard Latvian | lvs_Latn Magahi | mag_Deva Maithili | mai_Deva Malayalam | mal_Mlym Marathi | mar_Deva Minangkabau (Arabic script) | min_Arab Minangkabau (Latin script) | min_Latn Macedonian | mkd_Cyrl Plateau Malagasy | plt_Latn Maltese | mlt_Latn Meitei (Bengali script) | mni_Beng Halh Mongolian | khk_Cyrl Mossi | mos_Latn Maori | mri_Latn Burmese | mya_Mymr Dutch | nld_Latn Norwegian Nynorsk | nno_Latn Norwegian Bokmål | nob_Latn Nepali | npi_Deva Northern Sotho | nso_Latn Nuer | nus_Latn Nyanja | nya_Latn Occitan | oci_Latn West Central Oromo | gaz_Latn Odia | ory_Orya Pangasinan | pag_Latn Eastern Panjabi | pan_Guru Papiamento | pap_Latn Western Persian | pes_Arab Polish | pol_Latn Portuguese | por_Latn Dari | prs_Arab Southern Pashto | pbt_Arab Ayacucho Quechua | quy_Latn Romanian | ron_Latn Rundi | run_Latn Russian | rus_Cyrl Sango | sag_Latn Sanskrit | san_Deva Santali | sat_Olck Sicilian | scn_Latn Shan | shn_Mymr Sinhala | sin_Sinh Slovak | slk_Latn Slovenian | slv_Latn Samoan | smo_Latn Shona | sna_Latn Sindhi | snd_Arab Somali | som_Latn Southern Sotho | sot_Latn Spanish | spa_Latn Tosk Albanian | als_Latn Sardinian | srd_Latn Serbian | srp_Cyrl Swati | ssw_Latn Sundanese | sun_Latn Swedish | swe_Latn Swahili | swh_Latn Silesian | szl_Latn Tamil | tam_Taml Tatar | tat_Cyrl Telugu | tel_Telu Tajik | tgk_Cyrl Tagalog | tgl_Latn Thai | tha_Thai Tigrinya | tir_Ethi Tamasheq (Latin script) | taq_Latn Tamasheq (Tifinagh script) | taq_Tfng Tok Pisin | tpi_Latn Tswana | tsn_Latn Tsonga | tso_Latn Turkmen | tuk_Latn Tumbuka | tum_Latn Turkish | tur_Latn Twi | twi_Latn Central Atlas Tamazight | tzm_Tfng Uyghur | uig_Arab Ukrainian | ukr_Cyrl Umbundu | umb_Latn Urdu | urd_Arab Northern Uzbek | uzn_Latn Venetian | vec_Latn Vietnamese | vie_Latn Waray | war_Latn Wolof | wol_Latn Xhosa | xho_Latn Eastern Yiddish | ydd_Hebr Yoruba | yor_Latn Yue Chinese | yue_Hant Chinese (Simplified) | zho_Hans Chinese (Traditional) | zho_Hant Standard Malay | zsm_Latn Zulu | zul_Latn""" lang_codes = {l.split(" | ")[0]: l.split(" | ")[1] for l in lang_codes.split("\n")} dataset = load_dataset("facebook/flores", "all", trust_remote_code=True)["dev"] data_per_lang = {} for d in dataset: for full, code in lang_codes.items(): k = f"sentence_{code}" data_per_lang[full] = data_per_lang.get(code, []) + [d[k]] def get_results(tokenizer_name, base_lang, comp_lang, HF_token=""): tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, token=HF_token if HF_token != "" else False) base_data = data_per_lang[base_lang] comp_data = data_per_lang[comp_lang] base_results = [] comp_results = [] for base_d, comp_d in zip(base_data, comp_data): input_ids = tokenizer(base_d, return_tensors="np")[0] base_results.append(len(input_ids)) input_ids = tokenizer(comp_d, return_tensors="np")[0] comp_results.append(len(input_ids)) agg_base = np.array(base_results).mean() agg_comp = np.array(comp_results).mean() token_ratio = (agg_comp / agg_base) print(token_ratio) if token_ratio < 1.: adverb = "less" token_ratio = (1. - token_ratio) * 100 else: adverb = "more" token_ratio = (token_ratio - 1.) * 100 output = f"**You need {round(token_ratio, 3)}% {adverb} tokens to represent your text in {comp_lang} than in {base_lang}.**" return output with gr.Blocks() as demo: with gr.Row(): gr.Markdown("""