Commit
•
477a286
1
Parent(s):
d71546d
Changes for fast tokenizer (#20)
Browse files
- Update tokens (4080f4e993d5ae739ccdf4359347a24ea63e5ca0)
- Add tokenizer.json (906330ca580f0d119131905cc24dc97534acc616)
Co-authored-by: Jonatan Kłosko <[email protected]>
- added_tokens.json +0 -1
- special_tokens_map.json +1 -1
- tokenizer.json +0 -0
- tokenizer_config.json +1 -1
- vocab.json +1 -0
added_tokens.json
CHANGED
@@ -17,7 +17,6 @@
|
|
17 |
"<|da|>": 50285,
|
18 |
"<|de|>": 50261,
|
19 |
"<|el|>": 50281,
|
20 |
-
"<|endoftext|>": 50257,
|
21 |
"<|en|>": 50259,
|
22 |
"<|es|>": 50262,
|
23 |
"<|et|>": 50307,
|
|
|
17 |
"<|da|>": 50285,
|
18 |
"<|de|>": 50261,
|
19 |
"<|el|>": 50281,
|
|
|
20 |
"<|en|>": 50259,
|
21 |
"<|es|>": 50262,
|
22 |
"<|et|>": 50307,
|
special_tokens_map.json
CHANGED
@@ -124,7 +124,7 @@
|
|
124 |
},
|
125 |
"pad_token": "<|endoftext|>",
|
126 |
"unk_token": {
|
127 |
-
"content": "",
|
128 |
"lstrip": false,
|
129 |
"normalized": true,
|
130 |
"rstrip": false,
|
|
|
124 |
},
|
125 |
"pad_token": "<|endoftext|>",
|
126 |
"unk_token": {
|
127 |
+
"content": "<|endoftext|>",
|
128 |
"lstrip": false,
|
129 |
"normalized": true,
|
130 |
"rstrip": false,
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer_config.json
CHANGED
@@ -27,7 +27,7 @@
|
|
27 |
"tokenizer_class": "WhisperTokenizer",
|
28 |
"unk_token": {
|
29 |
"__type": "AddedToken",
|
30 |
-
"content": "",
|
31 |
"lstrip": false,
|
32 |
"normalized": true,
|
33 |
"rstrip": false,
|
|
|
27 |
"tokenizer_class": "WhisperTokenizer",
|
28 |
"unk_token": {
|
29 |
"__type": "AddedToken",
|
30 |
+
"content": "<|endoftext|>",
|
31 |
"lstrip": false,
|
32 |
"normalized": true,
|
33 |
"rstrip": false,
|
vocab.json
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
{
|
2 |
"": 50256,
|
|
|
3 |
"!": 0,
|
4 |
"!!": 1432,
|
5 |
"!!!": 4589,
|
|
|
1 |
{
|
2 |
"": 50256,
|
3 |
+
"<|endoftext|>": 50257,
|
4 |
"!": 0,
|
5 |
"!!": 1432,
|
6 |
"!!!": 4589,
|