qbaro committed on
Commit
e21df87
1 Parent(s): 7f83413

add tokenizer

Browse files
added_tokens.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"<s>": 110, "</s>": 111, "[PAD]": 112}
special_tokens_map.json CHANGED
@@ -1 +1 @@
1
- {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]"}
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "[PAD]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
tokenizer_config.json CHANGED
@@ -1 +1 @@
1
- {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
 
1
+ {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": "/root/.cache/huggingface/transformers/d60dd610960164f1b79222456db381022e8f93b2f92c21d276f4cd2cb7647737.9d6cd81ef646692fb1c169a880161ea1cb95f49694f220aced9b704b457e51dd", "tokenizer_file": null, "name_or_path": "nguyenvulebinh/wav2vec2-base-vietnamese-250h", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
vocab.json CHANGED
@@ -1 +1 @@
1
- {"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "g": 6, "h": 7, "i": 8, "k": 9, "l": 10, "m": 11, "n": 12, "o": 13, "p": 14, "q": 15, "r": 16, "s": 17, "t": 18, "u": 19, "v": 20, "x": 21, "y": 22, "à": 23, "á": 24, "ã": 25, "è": 26, "é": 27, "ê": 28, "ì": 29, "í": 30, "ò": 31, "ó": 32, "õ": 33, "ù": 34, "ú": 35, "ý": 36, "ă": 37, "đ": 38, "ĩ": 39, "ũ": 40, "ơ": 41, "ư": 42, "": 43, "": 44, "": 45, "": 46, "ẩ": 47, "": 48, "": 49, "": 50, "": 51, "": 52, "": 53, "": 54, "": 55, "": 56, "": 57, "ế": 58, "": 59, "": 60, "": 61, "": 62, "": 63, "": 64, "": 65, "": 66, "": 67, "": 68, "": 69, "": 70, "": 71, "": 72, "": 73, "": 74, "": 75, "": 76, "": 77, "": 78, "": 79, "": 80, "": 81, "": 82, "": 83, "": 84, "": 85, "": 86, "|": 0, "[UNK]": 87, "[PAD]": 88}
 
1
+ {"": 0, "6": 1, "": 2, "í": 3, "3": 4, "": 5, "ý": 6, "": 7, "": 8, "": 9, "õ": 10, "7": 11, "ê": 12, "": 13, "": 14, "v": 15, "": 16, "a": 17, "l": 18, "": 19, "q": 20, "": 21, "j": 22, "": 23, "à": 24, "": 25, "n": 26, "é": 27, "": 28, "у": 29, "ô": 30, "u": 31, "y": 32, "": 33, "4": 34, "w": 35, "b": 36, "": 37, "": 38, "s": 39, "ì": 40, "": 41, "": 42, "8": 43, "d": 44, "": 45, "r": 47, "ũ": 48, "c": 49, "": 50, "9": 51, "ế": 52, "ù": 53, "": 54, "2": 55, "t": 56, "i": 57, "g": 58, "́": 59, "": 60, "̀": 61, "á": 62, "0": 63, "": 64, "e": 65, "": 66, "m": 67, "": 68, "": 69, "ĩ": 70, "h": 71, "â": 72, "ú": 73, "": 74, "": 75, "": 76, "f": 77, "": 78, "": 79, "": 80, "x": 81, "ó": 82, "ã": 83, "": 84, "": 85, "̣": 86, "z": 87, "ả": 88, "đ": 89, "è": 90, "ừ": 91, "ò": 92, "ẵ": 93, "1": 94, "ơ": 95, "k": 96, "ẫ": 97, "p": 98, "ấ": 99, "ẽ": 100, "ỉ": 101, "ớ": 102, "ẹ": 103, "ă": 104, "o": 105, "ư": 106, "5": 107, "|": 46, "<unk>": 108, "<pad>": 109}