shacharm committed on
Commit
8c13b56
1 Parent(s): 03adfea

add tokenizer

Browse files
Files changed (3)
  1. special_tokens_map.json +1 -1
  2. tokenizer_config.json +1 -1
  3. vocab.json +1 -1
special_tokens_map.json CHANGED
@@ -1 +1 @@
1
- {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]"}
tokenizer_config.json CHANGED
@@ -1 +1 @@
1
- {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
 
1
+ {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
vocab.json CHANGED
@@ -1 +1 @@
1
- {"а": 1, "б": 2, "в": 3, "г": 4, "д": 5, "е": 6, "ж": 7, "з": 8, "и": 9, "к": 10, "л": 11, "м": 12, "н": 13, "о": 14, "п": 15, "р": 16, "с": 17, "т": 18, "у": 19, "ф": 20, "х": 21, "ц": 22, "ш": 23, "ы": 24, "ь": 25, "қ": 26, "ҟ": 27, "ҩ": 28, "ҭ": 29, "ҳ": 30, "ҵ": 31, "ҽ": 32, "ҿ": 33, "ә": 34, "ӡ": 35, "ӷ": 36, "ԥ": 37, "|": 0, "[UNK]": 38, "[PAD]": 39}
 
1
+ {"&": 1, "(": 2, ")": 3, "+": 4, "/": 5, "=": 6, "[": 7, "]": 8, "_": 9, "`": 10, "a": 11, "b": 12, "c": 13, "d": 14, "e": 15, "f": 16, "g": 17, "h": 18, "i": 19, "j": 20, "k": 21, "l": 22, "m": 23, "n": 24, "o": 25, "p": 26, "q": 27, "r": 28, "s": 29, "t": 30, "u": 31, "v": 32, "w": 33, "x": 34, "y": 35, "z": 36, "¡": 37, "ß": 38, "à": 39, "á": 40, "ã": 41, "ä": 42, "å": 43, "æ": 44, "ç": 45, "è": 46, "é": 47, "ê": 48, "ë": 49, "í": 50, "ð": 51, "ñ": 52, "ó": 53, "ö": 54, "ø": 55, "ú": 56, "ü": 57, "þ": 58, "ā": 59, "ă": 60, "ć": 61, "č": 62, "ę": 63, "ě": 64, "ğ": 65, "ī": 66, "ł": 67, "ń": 68, "ō": 69, "ő": 70, "œ": 71, "ř": 72, "š": 73, "ū": 74, "ž": 75, "ʻ": 76, "α": 77, "π": 78, "χ": 79, "в": 80, "е": 81, "з": 82, "и": 83, "й": 84, "к": 85, "л": 86, "н": 87, "ь": 88, "я": 89, "ṃ": 90, "ạ": 91, "ụ": 92, "–": 93, "—": 94, "’": 95, "…": 96, "→": 97, "≡": 98, "京": 99, "大": 100, "都": 101, "阪": 102, "fl": 103, "|": 0, "[UNK]": 104, "[PAD]": 105}