--- license: cc-by-nc-4.0 language: - ace - ace - acm - acq - aeb - af - ajp - ak - am - apc - ar - ars - ary - arz - as - ast - awa - ay - azb - azj - ba - bm - ban - be - bem - bn - bho - bjn - bjn - bo - bs - bug - bg - ca - ceb - cs - cjk - ckb - crh - cy - da - de - dik - dyu - dz - el - en - eo - et - eu - ee - fo - fa - fj - fi - fon - fr - fur - ff - gd - ga - gl - gn - gu - ht - ha - he - hi - hne - hr - hu - hy - ig - ilo - id - is - it - jv - ja - kab - kac - kam - kn - ks - ks - ka - kr - kr - kk - kbp - kea - km - ki - rw - ky - kmb - kg - ko - kmr - lo - lv - lij - li - ln - lt - lmo - ltg - lb - lua - lg - luo - lus - mag - mai - ml - mr - min - mk - plt - mt - mni - mn - mos - mi - ms - my - nl - nn - nb - ne - nso - nus - ny - oc - gaz - ory - pag - pa - pap - pl - pt - prs - pbt - qu - ro - rn - ru - sg - sa - sat - scn - shn - si - sk - sl - sm - sn - sd - so - st - es - als - sc - sr - ss - su - sv - sw - szl - ta - tt - te - tg - tl - th - ti - taq - taq - tpi - tn - ts - tk - tum - tr - tw - tzm - ug - uk - umb - ur - uz - vec - vi - war - wo - xh - yi - yo - yue - zh - zh - zu language_details: >- ace_Arab, ace_Latn, acm_Arab, acq_Arab, aeb_Arab, afr_Latn, ajp_Arab, aka_Latn, amh_Ethi, apc_Arab, arb_Arab, ars_Arab, ary_Arab, arz_Arab, asm_Beng, ast_Latn, awa_Deva, ayr_Latn, azb_Arab, azj_Latn, bak_Cyrl, bam_Latn, ban_Latn,bel_Cyrl, bem_Latn, ben_Beng, bho_Deva, bjn_Arab, bjn_Latn, bod_Tibt, bos_Latn, bug_Latn, bul_Cyrl, cat_Latn, ceb_Latn, ces_Latn, cjk_Latn, ckb_Arab, crh_Latn, cym_Latn, dan_Latn, deu_Latn, dik_Latn, dyu_Latn, dzo_Tibt, ell_Grek, eng_Latn, epo_Latn, est_Latn, eus_Latn, ewe_Latn, fao_Latn, pes_Arab, fij_Latn, fin_Latn, fon_Latn, fra_Latn, fur_Latn, fuv_Latn, gla_Latn, gle_Latn, glg_Latn, grn_Latn, guj_Gujr, hat_Latn, hau_Latn, heb_Hebr, hin_Deva, hne_Deva, hrv_Latn, hun_Latn, hye_Armn, ibo_Latn, ilo_Latn, ind_Latn, isl_Latn, ita_Latn, jav_Latn, jpn_Jpan, kab_Latn, kac_Latn, kam_Latn, kan_Knda, kas_Arab, kas_Deva, kat_Geor, 
knc_Arab, knc_Latn, kaz_Cyrl, kbp_Latn, kea_Latn, khm_Khmr, kik_Latn, kin_Latn, kir_Cyrl, kmb_Latn, kon_Latn, kor_Hang, kmr_Latn, lao_Laoo, lvs_Latn, lij_Latn, lim_Latn, lin_Latn, lit_Latn, lmo_Latn, ltg_Latn, ltz_Latn, lua_Latn, lug_Latn, luo_Latn, lus_Latn, mag_Deva, mai_Deva, mal_Mlym, mar_Deva, min_Latn, mkd_Cyrl, plt_Latn, mlt_Latn, mni_Beng, khk_Cyrl, mos_Latn, mri_Latn, zsm_Latn, mya_Mymr, nld_Latn, nno_Latn, nob_Latn, npi_Deva, nso_Latn, nus_Latn, nya_Latn, oci_Latn, gaz_Latn, ory_Orya, pag_Latn, pan_Guru, pap_Latn, pol_Latn, por_Latn, prs_Arab, pbt_Arab, quy_Latn, ron_Latn, run_Latn, rus_Cyrl, sag_Latn, san_Deva, sat_Beng, scn_Latn, shn_Mymr, sin_Sinh, slk_Latn, slv_Latn, smo_Latn, sna_Latn, snd_Arab, som_Latn, sot_Latn, spa_Latn, als_Latn, srd_Latn, srp_Cyrl, ssw_Latn, sun_Latn, swe_Latn, swh_Latn, szl_Latn, tam_Taml, tat_Cyrl, tel_Telu, tgk_Cyrl, tgl_Latn, tha_Thai, tir_Ethi, taq_Latn, taq_Tfng, tpi_Latn, tsn_Latn, tso_Latn, tuk_Latn, tum_Latn, tur_Latn, twi_Latn, tzm_Tfng, uig_Arab, ukr_Cyrl, umb_Latn, urd_Arab, uzn_Latn, vec_Latn, vie_Latn, war_Latn, wol_Latn, xho_Latn, ydd_Hebr, yor_Latn, yue_Hant, zho_Hans, zho_Hant, zul_Latn pipeline_tag: sentence-similarity --- This is a port of the multilingual SONAR text encoder (https://huggingface.co/facebook/SONAR) to the `transformers` format from `fairseq2`. Its embeddings are expected to be equal to those of the official implementation (https://github.com/facebookresearch/SONAR), but the latter remains the source of truth. The encoder supports the same 202 languages as [NLLB-200](https://huggingface.co/facebook/nllb-200-distilled-600M) (see also [the source model card](https://github.com/facebookresearch/SONAR/blob/main/sonar/store/cards/text_sonar_basic_encoder.yaml#L14) and [FLORES-200 lang code mapping](https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200)). 
How to compute embeddings: ```Python # !pip install transformers sentencepiece -q import torch from transformers import AutoTokenizer from transformers.models.m2m_100.modeling_m2m_100 import M2M100Encoder model_name = "cointegrated/SONAR_200_text_encoder" encoder = M2M100Encoder.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name) def encode_mean_pool(texts, tokenizer, encoder, lang='eng_Latn', norm=False): tokenizer.src_lang = lang with torch.inference_mode(): batch = tokenizer(texts, return_tensors='pt', padding=True) seq_embs = encoder(**batch).last_hidden_state mask = batch.attention_mask mean_emb = (seq_embs * mask.unsqueeze(-1)).sum(1) / mask.unsqueeze(-1).sum(1) if norm: mean_emb = torch.nn.functional.normalize(mean_emb) return mean_emb sentences = ['My name is SONAR.', 'I can embed the sentences into vectorial space.'] embs = encode_mean_pool(sentences, tokenizer, encoder, lang="eng_Latn") print(embs.shape) # torch.Size([2, 1024]) print(embs) # tensor([[-0.0053, 0.0020, -0.0006, ..., 0.0094, -0.0009, 0.0070], # [-0.0003, -0.0071, 0.0076, ..., 0.0055, 0.0022, -0.0083]]) ``` For advanced examples of usage, please take a look at the readme in https://github.com/facebookresearch/SONAR. The model was repacked [in this notebook](https://colab.research.google.com/drive/1s6JuQWaMnWXyFdni1AAw-Z48k8YoKwG7?usp=sharing).