cointegrated committed on
Commit
71ff111
1 Parent(s): 3e140a9

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +232 -204
README.md CHANGED
@@ -1,216 +1,244 @@
1
  ---
2
  license: cc-by-nc-4.0
3
- language_bcp47: # TODO: convert them to traditional formats
4
- - ace_Arab
5
- - ace_Latn
6
- - acm_Arab
7
- - acq_Arab
8
- - aeb_Arab
9
- - afr_Latn
10
- - ajp_Arab
11
- - aka_Latn
12
- - amh_Ethi
13
- - apc_Arab
14
- - arb_Arab
15
- - ars_Arab
16
- - ary_Arab
17
- - arz_Arab
18
- - asm_Beng
19
- - ast_Latn
20
- - awa_Deva
21
- - ayr_Latn
22
- - azb_Arab
23
- - azj_Latn
24
- - bak_Cyrl
25
- - bam_Latn
26
- - ban_Latn
27
- - bel_Cyrl
28
- - bem_Latn
29
- - ben_Beng
30
- - bho_Deva
31
- - bjn_Arab
32
- - bjn_Latn
33
- - bod_Tibt
34
- - bos_Latn
35
- - bug_Latn
36
- - bul_Cyrl
37
- - cat_Latn
38
- - ceb_Latn
39
- - ces_Latn
40
- - cjk_Latn
41
- - ckb_Arab
42
- - crh_Latn
43
- - cym_Latn
44
- - dan_Latn
45
- - deu_Latn
46
- - dik_Latn
47
- - dyu_Latn
48
- - dzo_Tibt
49
- - ell_Grek
50
- - eng_Latn
51
- - epo_Latn
52
- - est_Latn
53
- - eus_Latn
54
- - ewe_Latn
55
- - fao_Latn
56
- - pes_Arab
57
- - fij_Latn
58
- - fin_Latn
59
- - fon_Latn
60
- - fra_Latn
61
- - fur_Latn
62
- - fuv_Latn
63
- - gla_Latn
64
- - gle_Latn
65
- - glg_Latn
66
- - grn_Latn
67
- - guj_Gujr
68
- - hat_Latn
69
- - hau_Latn
70
- - heb_Hebr
71
- - hin_Deva
72
- - hne_Deva
73
- - hrv_Latn
74
- - hun_Latn
75
- - hye_Armn
76
- - ibo_Latn
77
- - ilo_Latn
78
- - ind_Latn
79
- - isl_Latn
80
- - ita_Latn
81
- - jav_Latn
82
- - jpn_Jpan
83
- - kab_Latn
84
- - kac_Latn
85
- - kam_Latn
86
- - kan_Knda
87
- - kas_Arab
88
- - kas_Deva
89
- - kat_Geor
90
- - knc_Arab
91
- - knc_Latn
92
- - kaz_Cyrl
93
- - kbp_Latn
94
- - kea_Latn
95
- - khm_Khmr
96
- - kik_Latn
97
- - kin_Latn
98
- - kir_Cyrl
99
- - kmb_Latn
100
- - kon_Latn
101
- - kor_Hang
102
- - kmr_Latn
103
- - lao_Laoo
104
- - lvs_Latn
105
- - lij_Latn
106
- - lim_Latn
107
- - lin_Latn
108
- - lit_Latn
109
- - lmo_Latn
110
- - ltg_Latn
111
- - ltz_Latn
112
- - lua_Latn
113
- - lug_Latn
114
- - luo_Latn
115
- - lus_Latn
116
- - mag_Deva
117
- - mai_Deva
118
- - mal_Mlym
119
- - mar_Deva
120
- - min_Latn
121
- - mkd_Cyrl
122
- - plt_Latn
123
- - mlt_Latn
124
- - mni_Beng
125
- - khk_Cyrl
126
- - mos_Latn
127
- - mri_Latn
128
- - zsm_Latn
129
- - mya_Mymr
130
- - nld_Latn
131
- - nno_Latn
132
- - nob_Latn
133
- - npi_Deva
134
- - nso_Latn
135
- - nus_Latn
136
- - nya_Latn
137
- - oci_Latn
138
- - gaz_Latn
139
- - ory_Orya
140
- - pag_Latn
141
- - pan_Guru
142
- - pap_Latn
143
- - pol_Latn
144
- - por_Latn
145
- - prs_Arab
146
- - pbt_Arab
147
- - quy_Latn
148
- - ron_Latn
149
- - run_Latn
150
- - rus_Cyrl
151
- - sag_Latn
152
- - san_Deva
153
- - sat_Beng
154
- - scn_Latn
155
- - shn_Mymr
156
- - sin_Sinh
157
- - slk_Latn
158
- - slv_Latn
159
- - smo_Latn
160
- - sna_Latn
161
- - snd_Arab
162
- - som_Latn
163
- - sot_Latn
164
- - spa_Latn
165
- - als_Latn
166
- - srd_Latn
167
- - srp_Cyrl
168
- - ssw_Latn
169
- - sun_Latn
170
- - swe_Latn
171
- - swh_Latn
172
- - szl_Latn
173
- - tam_Taml
174
- - tat_Cyrl
175
- - tel_Telu
176
- - tgk_Cyrl
177
- - tgl_Latn
178
- - tha_Thai
179
- - tir_Ethi
180
- - taq_Latn
181
- - taq_Tfng
182
- - tpi_Latn
183
- - tsn_Latn
184
- - tso_Latn
185
- - tuk_Latn
186
- - tum_Latn
187
- - tur_Latn
188
- - twi_Latn
189
- - tzm_Tfng
190
- - uig_Arab
191
- - ukr_Cyrl
192
- - umb_Latn
193
- - urd_Arab
194
- - uzn_Latn
195
- - vec_Latn
196
- - vie_Latn
197
- - war_Latn
198
- - wol_Latn
199
- - xho_Latn
200
- - ydd_Hebr
201
- - yor_Latn
202
- - yue_Hant
203
- - zho_Hans
204
- - zho_Hant
205
- - zul_Latn
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  ---
207
  This is a port of the multilingual text encoder from https://huggingface.co/facebook/SONAR to `transformers` format from `fairseq2`.
208
 
 
 
 
 
209
  For advanced examples of usage, please take a look at https://github.com/facebookresearch/SONAR.
210
 
211
  How to use:
212
  ```Python
213
- !pip install transformers sentencepiece -q
214
 
215
  import torch
216
  from transformers import AutoTokenizer
 
1
  ---
2
  license: cc-by-nc-4.0
3
+ language:
4
+ - ace
5
+ - acm
6
+ - acq
7
+ - aeb
8
+ - af
9
+ - ajp
10
+ - ak
11
+ - als
12
+ - am
13
+ - apc
14
+ - ar
15
+ - ars
16
+ - ary
17
+ - arz
18
+ - as
19
+ - ast
20
+ - awa
21
+ - ayr
22
+ - azb
23
+ - azj
24
+ - ba
25
+ - bm
26
+ - ban
27
+ - be
28
+ - bem
29
+ - bn
30
+ - bho
31
+ - bjn
32
+ - bo
33
+ - bs
34
+ - bug
35
+ - bg
36
+ - ca
37
+ - ceb
38
+ - cs
39
+ - cjk
40
+ - ckb
41
+ - crh
42
+ - cy
43
+ - da
44
+ - de
45
+ - dik
46
+ - dyu
47
+ - dz
48
+ - el
49
+ - en
50
+ - eo
51
+ - et
52
+ - eu
53
+ - ee
54
+ - fo
55
+ - fj
56
+ - fi
57
+ - fon
58
+ - fr
59
+ - fur
60
+ - fuv
61
+ - gaz
62
+ - gd
63
+ - ga
64
+ - gl
65
+ - gn
66
+ - gu
67
+ - ht
68
+ - ha
69
+ - he
70
+ - hi
71
+ - hne
72
+ - hr
73
+ - hu
74
+ - hy
75
+ - ig
76
+ - ilo
77
+ - id
78
+ - is
79
+ - it
80
+ - jv
81
+ - ja
82
+ - kab
83
+ - kac
84
+ - kam
85
+ - kn
86
+ - ks
87
+ - ka
88
+ - kk
89
+ - kbp
90
+ - kea
91
+ - khk
92
+ - km
93
+ - ki
94
+ - rw
95
+ - ky
96
+ - kmb
97
+ - kmr
98
+ - knc
99
+ - kg
100
+ - ko
101
+ - lo
102
+ - lij
103
+ - li
104
+ - ln
105
+ - lt
106
+ - lmo
107
+ - ltg
108
+ - lb
109
+ - lua
110
+ - lg
111
+ - luo
112
+ - lus
113
+ - lvs
114
+ - mag
115
+ - mai
116
+ - ml
117
+ - mar
118
+ - min
119
+ - mk
120
+ - mt
121
+ - mni
122
+ - mos
123
+ - mi
124
+ - my
125
+ - nl
126
+ - nn
127
+ - nb
128
+ - npi
129
+ - nso
130
+ - nus
131
+ - ny
132
+ - oc
133
+ - ory
134
+ - pag
135
+ - pa
136
+ - pap
137
+ - pbt
138
+ - pes
139
+ - plt
140
+ - pl
141
+ - pt
142
+ - prs
143
+ - quy
144
+ - ro
145
+ - rn
146
+ - ru
147
+ - sg
148
+ - sa
149
+ - sat
150
+ - scn
151
+ - shn
152
+ - si
153
+ - sk
154
+ - sl
155
+ - sm
156
+ - sn
157
+ - sd
158
+ - so
159
+ - st
160
+ - es
161
+ - sc
162
+ - sr
163
+ - ss
164
+ - su
165
+ - sv
166
+ - swh
167
+ - szl
168
+ - ta
169
+ - taq
170
+ - tt
171
+ - te
172
+ - tg
173
+ - tl
174
+ - th
175
+ - ti
176
+ - tpi
177
+ - tn
178
+ - ts
179
+ - tk
180
+ - tum
181
+ - tr
182
+ - tw
183
+ - tzm
184
+ - ug
185
+ - uk
186
+ - umb
187
+ - ur
188
+ - uzn
189
+ - vec
190
+ - vi
191
+ - war
192
+ - wo
193
+ - xh
194
+ - ydd
195
+ - yo
196
+ - yue
197
+ - zh
198
+ - zsm
199
+ - zu
200
+ language_details: >-
201
+ ace_Arab, ace_Latn, acm_Arab, acq_Arab, aeb_Arab, afr_Latn, ajp_Arab,
202
+ aka_Latn, amh_Ethi, apc_Arab, arb_Arab, ars_Arab, ary_Arab, arz_Arab,
203
+ asm_Beng, ast_Latn, awa_Deva, ayr_Latn, azb_Arab, azj_Latn, bak_Cyrl,
204
+ bam_Latn, ban_Latn, bel_Cyrl, bem_Latn, ben_Beng, bho_Deva, bjn_Arab, bjn_Latn,
205
+ bod_Tibt, bos_Latn, bug_Latn, bul_Cyrl, cat_Latn, ceb_Latn, ces_Latn,
206
+ cjk_Latn, ckb_Arab, crh_Latn, cym_Latn, dan_Latn, deu_Latn, dik_Latn,
207
+ dyu_Latn, dzo_Tibt, ell_Grek, eng_Latn, epo_Latn, est_Latn, eus_Latn,
208
+ ewe_Latn, fao_Latn, pes_Arab, fij_Latn, fin_Latn, fon_Latn, fra_Latn,
209
+ fur_Latn, fuv_Latn, gla_Latn, gle_Latn, glg_Latn, grn_Latn, guj_Gujr,
210
+ hat_Latn, hau_Latn, heb_Hebr, hin_Deva, hne_Deva, hrv_Latn, hun_Latn,
211
+ hye_Armn, ibo_Latn, ilo_Latn, ind_Latn, isl_Latn, ita_Latn, jav_Latn,
212
+ jpn_Jpan, kab_Latn, kac_Latn, kam_Latn, kan_Knda, kas_Arab, kas_Deva,
213
+ kat_Geor, knc_Arab, knc_Latn, kaz_Cyrl, kbp_Latn, kea_Latn, khm_Khmr,
214
+ kik_Latn, kin_Latn, kir_Cyrl, kmb_Latn, kon_Latn, kor_Hang, kmr_Latn,
215
+ lao_Laoo, lvs_Latn, lij_Latn, lim_Latn, lin_Latn, lit_Latn, lmo_Latn,
216
+ ltg_Latn, ltz_Latn, lua_Latn, lug_Latn, luo_Latn, lus_Latn, mag_Deva,
217
+ mai_Deva, mal_Mlym, mar_Deva, min_Latn, mkd_Cyrl, plt_Latn, mlt_Latn,
218
+ mni_Beng, khk_Cyrl, mos_Latn, mri_Latn, zsm_Latn, mya_Mymr, nld_Latn,
219
+ nno_Latn, nob_Latn, npi_Deva, nso_Latn, nus_Latn, nya_Latn, oci_Latn,
220
+ gaz_Latn, ory_Orya, pag_Latn, pan_Guru, pap_Latn, pol_Latn, por_Latn,
221
+ prs_Arab, pbt_Arab, quy_Latn, ron_Latn, run_Latn, rus_Cyrl, sag_Latn,
222
+ san_Deva, sat_Beng, scn_Latn, shn_Mymr, sin_Sinh, slk_Latn, slv_Latn,
223
+ smo_Latn, sna_Latn, snd_Arab, som_Latn, sot_Latn, spa_Latn, als_Latn,
224
+ srd_Latn, srp_Cyrl, ssw_Latn, sun_Latn, swe_Latn, swh_Latn, szl_Latn,
225
+ tam_Taml, tat_Cyrl, tel_Telu, tgk_Cyrl, tgl_Latn, tha_Thai, tir_Ethi,
226
+ taq_Latn, taq_Tfng, tpi_Latn, tsn_Latn, tso_Latn, tuk_Latn, tum_Latn,
227
+ tur_Latn, twi_Latn, tzm_Tfng, uig_Arab, ukr_Cyrl, umb_Latn, urd_Arab,
228
+ uzn_Latn, vec_Latn, vie_Latn, war_Latn, wol_Latn, xho_Latn, ydd_Hebr,
229
+ yor_Latn, yue_Hant, zho_Hans, zho_Hant, zul_Latn
230
  ---
231
  This is a port of the multilingual text encoder from https://huggingface.co/facebook/SONAR to `transformers` format from `fairseq2`.
232
 
233
+ It supports the same 202 languages as [NLLB-200](https://huggingface.co/facebook/nllb-200-distilled-600M)
234
+ (see also [the source model card](https://github.com/facebookresearch/SONAR/blob/main/sonar/store/cards/text_sonar_basic_encoder.yaml#L14)
235
+ and [FLORES-200 lang code mapping](https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200)).
236
+
237
  For advanced examples of usage, please take a look at https://github.com/facebookresearch/SONAR.
238
 
239
  How to use:
240
  ```Python
241
+ # !pip install transformers sentencepiece -q
242
 
243
  import torch
244
  from transformers import AutoTokenizer