Upload tokenizer (#26)
Upload tokenizer (33482313ea52a0bc9ee1303ac23d3f2d36a90932)
- added_tokens.json +0 -1
- merges.txt +1 -1
- special_tokens_map.json +1 -1
- tokenizer.json +0 -0
- tokenizer_config.json +1 -2
- vocab.json +0 -0
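
The uploaded files can be sanity-checked together after cloning the repository. A minimal sketch, assuming a local checkout in the current directory and a transformers version that ships WhisperTokenizerFast (neither is stated in this commit):

from transformers import WhisperTokenizer, WhisperTokenizerFast

# Sketch: load both the slow (vocab.json + merges.txt) and the fast
# (tokenizer.json) tokenizer from the local checkout and compare their
# output on a short string.
slow = WhisperTokenizer.from_pretrained(".")
fast = WhisperTokenizerFast.from_pretrained(".")

text = "hello world"
print("slow:", slow.encode(text))
print("fast:", fast.encode(text))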
added_tokens.json
CHANGED
@@ -17,7 +17,6 @@
   "<|da|>": 50285,
   "<|de|>": 50261,
   "<|el|>": 50281,
-  "<|endoftext|>": 50257,
   "<|en|>": 50259,
   "<|es|>": 50262,
   "<|et|>": 50307,
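
One way to check that dropping "<|endoftext|>" from added_tokens.json does not lose the token: it should still resolve to its vocabulary id (50257 in the removed line above). A sketch, assuming a local checkout:

from transformers import WhisperTokenizer

tok = WhisperTokenizer.from_pretrained(".")  # local checkout of this repo
# <|endoftext|> is no longer listed as an added token, but it should still
# map to the same id via the base vocabulary / special-token handling.
print(tok.convert_tokens_to_ids("<|endoftext|>"))
# Remaining added tokens (language tags such as <|da|>, <|de|>, ...).
print(sorted(tok.get_added_vocab())[:5])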
merges.txt
CHANGED
@@ -1,4 +1,4 @@
-#version: 0.2
+#version: 0.2 - Trained by `huggingface/tokenizers`
 Ġ a
 Ġt h
 i n
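
Per the file summary (+1 -1), only the comment header on line 1 of merges.txt changes; the BPE merge rules themselves are untouched, so tokenization should be unaffected. A quick check, assuming a local checkout:

# Print the merges.txt header line; everything after it is identical.
with open("merges.txt", encoding="utf-8") as f:
    print(f.readline().rstrip())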
special_tokens_map.json
CHANGED
@@ -124,7 +124,7 @@
   },
   "pad_token": "<|endoftext|>",
   "unk_token": {
-    "content": "",
+    "content": "<|endoftext|>",
     "lstrip": false,
     "normalized": true,
     "rstrip": false,
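
With this change the unknown token aliases <|endoftext|> instead of an empty string, matching the pad token in the same file. A sketch to verify, assuming a local checkout:

from transformers import WhisperTokenizer

tok = WhisperTokenizer.from_pretrained(".")  # local checkout of this repo
# unk_token and pad_token should now both render as <|endoftext|>.
print(tok.unk_token, tok.unk_token_id)
print(tok.pad_token, tok.pad_token_id)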
tokenizer.json
ADDED
The diff for this file is too large to render; see the raw diff.
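
Adding tokenizer.json means the Rust-backed fast tokenizer can be loaded directly rather than converted from vocab.json / merges.txt on the fly. A sketch, assuming a local checkout and a transformers version with Whisper fast-tokenizer support:

from transformers import AutoTokenizer

# AutoTokenizer picks up "tokenizer_class": "WhisperTokenizer" from
# tokenizer_config.json and loads the fast variant from tokenizer.json.
tok = AutoTokenizer.from_pretrained(".", use_fast=True)
print(type(tok).__name__, tok.is_fast)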
tokenizer_config.json
CHANGED
@@ -19,7 +19,6 @@
   },
   "errors": "replace",
   "model_max_length": 1024,
-  "name_or_path": "openai/whisper-large",
   "pad_token": null,
   "processor_class": "WhisperProcessor",
   "return_attention_mask": false,
@@ -27,7 +26,7 @@
   "tokenizer_class": "WhisperTokenizer",
   "unk_token": {
     "__type": "AddedToken",
-    "content": "",
+    "content": "<|endoftext|>",
     "lstrip": false,
     "normalized": true,
     "rstrip": false,
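
The two hunks above drop the "name_or_path" provenance field and set the unk_token AddedToken content to <|endoftext|>, mirroring special_tokens_map.json. A sketch that reads the updated config directly, assuming a local checkout:

import json

# Confirm both changes in the committed tokenizer_config.json.
with open("tokenizer_config.json", encoding="utf-8") as f:
    cfg = json.load(f)

print("name_or_path" in cfg)        # False after this commit
print(cfg["unk_token"]["content"])  # <|endoftext|>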
vocab.json
CHANGED
The diff for this file is too large to render; see the raw diff.