khushi1234455687 committed
Commit bbffe78
1 Parent(s): 714338c

Upload tokenizer

Files changed (5)
  1. README.md +7 -7
  2. added_tokens.json +4 -0
  3. special_tokens_map.json +6 -0
  4. tokenizer_config.json +47 -0
  5. vocab.json +132 -0
README.md CHANGED
@@ -1,19 +1,19 @@
  ---
- library_name: transformers
- license: apache-2.0
  base_model: facebook/wav2vec2-large-xlsr-53
- tags:
- - generated_from_trainer
  datasets:
  - fleurs
+ library_name: transformers
+ license: apache-2.0
  metrics:
  - wer
+ tags:
+ - generated_from_trainer
  model-index:
  - name: wav2vec2-large-xlsr-oria-v0
    results:
    - task:
-       name: Automatic Speech Recognition
        type: automatic-speech-recognition
+       name: Automatic Speech Recognition
      dataset:
        name: fleurs
        type: fleurs
@@ -21,9 +21,9 @@ model-index:
        split: None
        args: or_in
      metrics:
-     - name: Wer
-       type: wer
+     - type: wer
        value: 0.4972150445018662
+       name: Wer
  ---

  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
added_tokens.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "</s>": 131,
+   "<s>": 130
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "bos_token": "<s>",
+   "eos_token": "</s>",
+   "pad_token": "[PAD]",
+   "unk_token": "[UNK]"
+ }
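
Together with added_tokens.json above, this map pins down the id layout: [UNK] and [PAD] are ids 128 and 129 inside vocab.json, while <s> and </s> are appended as ids 130 and 131. A minimal sketch of assembling the full 132-entry token-to-id table from the uploaded files (assumes the JSON files from this commit are in the working directory):

# Sketch: merge vocab.json with added_tokens.json to get the complete token-to-id table.
import json

with open("vocab.json", encoding="utf-8") as f:
    vocab = json.load(f)        # 130 entries, ids 0-129 (including [UNK]=128, [PAD]=129)
with open("added_tokens.json", encoding="utf-8") as f:
    vocab.update(json.load(f))  # adds "<s>": 130 and "</s>": 131

print(len(vocab))  # 132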
tokenizer_config.json ADDED
@@ -0,0 +1,47 @@
+ {
+   "added_tokens_decoder": {
+     "128": {
+       "content": "[UNK]",
+       "lstrip": true,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": false
+     },
+     "129": {
+       "content": "[PAD]",
+       "lstrip": true,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": false
+     },
+     "130": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "131": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": true,
+   "do_lower_case": false,
+   "eos_token": "</s>",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "[PAD]",
+   "replace_word_delimiter_char": " ",
+   "target_lang": null,
+   "tokenizer_class": "Wav2Vec2CTCTokenizer",
+   "unk_token": "[UNK]",
+   "word_delimiter_token": "|"
+ }
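
The config selects Wav2Vec2CTCTokenizer, keeps "|" as the word-delimiter token, and marks only <s>/</s> as special tokens ([PAD] conventionally doubles as the CTC blank in wav2vec2 fine-tuning). A minimal sketch of loading the uploaded tokenizer with transformers; the repo id below is assumed from the committer and model name, it is not stated in this commit:

# Sketch: load the tokenizer files added in this commit.
from transformers import Wav2Vec2CTCTokenizer

repo_id = "khushi1234455687/wav2vec2-large-xlsr-oria-v0"  # assumed repo id; adjust if different
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(repo_id)

print(tokenizer.pad_token, tokenizer.unk_token)        # [PAD] [UNK]
print(tokenizer.bos_token_id, tokenizer.eos_token_id)  # 130 131
print(tokenizer.word_delimiter_token)                  # |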
vocab.json ADDED
@@ -0,0 +1,132 @@
+ {
+   "$": 121,
+   "'": 78,
+   "*": 66,
+   "+": 64,
+   "/": 75,
+   "0": 117,
+   "1": 16,
+   "2": 90,
+   "3": 73,
+   "4": 45,
+   "5": 86,
+   "6": 61,
+   "7": 31,
+   "8": 92,
+   "9": 77,
+   "[": 25,
+   "[PAD]": 129,
+   "[UNK]": 128,
+   "]": 79,
+   "a": 50,
+   "b": 116,
+   "c": 5,
+   "d": 113,
+   "e": 33,
+   "f": 69,
+   "g": 8,
+   "h": 15,
+   "i": 99,
+   "j": 37,
+   "k": 91,
+   "l": 94,
+   "m": 115,
+   "n": 82,
+   "o": 6,
+   "p": 59,
+   "q": 35,
+   "r": 60,
+   "s": 103,
+   "t": 55,
+   "u": 102,
+   "v": 57,
+   "w": 76,
+   "x": 67,
+   "y": 109,
+   "z": 24,
+   "|": 20,
+   "£": 1,
+   "¥": 0,
+   "°": 27,
+   "½": 84,
+   "¾": 87,
+   "õ": 2,
+   "।": 36,
+   "ଁ": 47,
+   "ଂ": 96,
+   "ଃ": 38,
+   "ଅ": 29,
+   "ଆ": 95,
+   "ଇ": 9,
+   "ଈ": 68,
+   "ଉ": 100,
+   "ଊ": 40,
+   "ଋ": 19,
+   "ଏ": 11,
+   "ଐ": 39,
+   "ଓ": 110,
+   "ଔ": 48,
+   "କ": 44,
+   "ଖ": 7,
+   "ଗ": 42,
+   "ଘ": 13,
+   "ଙ": 65,
+   "ଚ": 28,
+   "ଛ": 120,
+   "ଜ": 30,
+   "ଝ": 118,
+   "ଞ": 89,
+   "ଟ": 43,
+   "ଠ": 46,
+   "ଡ": 21,
+   "ଢ": 56,
+   "ଣ": 107,
+   "ତ": 51,
+   "ଥ": 97,
+   "ଦ": 105,
+   "ଧ": 123,
+   "ନ": 122,
+   "ପ": 93,
+   "ଫ": 12,
+   "ବ": 101,
+   "ଭ": 85,
+   "ମ": 81,
+   "ଯ": 32,
+   "ର": 34,
+   "ଲ": 49,
+   "ଳ": 114,
+   "ଵ": 58,
+   "ଶ": 14,
+   "ଷ": 23,
+   "ସ": 80,
+   "ହ": 41,
+   "଼": 10,
+   "ା": 71,
+   "ି": 52,
+   "ୀ": 125,
+   "ୁ": 119,
+   "ୂ": 74,
+   "ୃ": 104,
+   "ୄ": 124,
+   "େ": 88,
+   "ୈ": 3,
+   "ୋ": 72,
+   "ୌ": 4,
+   "୍": 112,
+   "ୗ": 54,
+   "ୟ": 63,
+   "୦": 17,
+   "୧": 83,
+   "୨": 111,
+   "୩": 70,
+   "୪": 53,
+   "୬": 106,
+   "୭": 26,
+   "ୱ": 18,
+   "​": 98,
+   "‌": 62,
+   "‍": 108,
+   "–": 22,
+   "—": 127,
+   "’": 126
+ }
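
The vocabulary is character-level: digits, a few Latin letters and symbols, the Odia (Oriya) block, some zero-width joiner/space characters, and "|" (id 20) standing in for the space between words, with [UNK]=128 and [PAD]=129. A minimal sketch of what encoding and decoding against this table looks like, using only vocab.json; the example string is arbitrary, not taken from the FLEURS data, and real decoding additionally collapses CTC repeats and drops [PAD] on the model output:

# Sketch: round-trip an Odia string through the character vocab above.
import json

with open("vocab.json", encoding="utf-8") as f:
    char2id = json.load(f)
id2char = {i: c for c, i in char2id.items()}

text = "ମୋ ଭାଷା"  # arbitrary example; spaces map to the "|" delimiter token
ids = [char2id.get("|" if ch == " " else ch, char2id["[UNK]"]) for ch in text]
decoded = "".join(id2char[i] for i in ids).replace("|", " ")
print(ids)
print(decoded == text)  # True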