felipovysk committed
Commit 8249fe5
1 Parent(s): 45ea1be

Upload tokenizer

Files changed (3)
  1. special_tokens_map.json +4 -10
  2. tokenizer.json +11 -6
  3. tokenizer_config.json +6 -7
special_tokens_map.json CHANGED
@@ -2,28 +2,22 @@
   "bos_token": {
     "content": "<s>",
     "lstrip": false,
-    "normalized": false,
+    "normalized": true,
     "rstrip": false,
     "single_word": false
   },
   "eos_token": {
     "content": "</s>",
     "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "pad_token": {
-    "content": "<pad>",
-    "lstrip": false,
-    "normalized": false,
+    "normalized": true,
     "rstrip": false,
     "single_word": false
   },
+  "pad_token": "</s>",
   "unk_token": {
     "content": "<unk>",
     "lstrip": false,
-    "normalized": false,
+    "normalized": true,
     "rstrip": false,
     "single_word": false
   }
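
The practical effect of this file is easiest to see by loading the tokenizer and inspecting its special tokens. A minimal sketch with the transformers library; the repository id below is a placeholder, not the actual repo name:

from transformers import AutoTokenizer

# "felipovysk/model" is a hypothetical placeholder id.
tok = AutoTokenizer.from_pretrained("felipovysk/model")

print(tok.special_tokens_map)
# After this commit the map should carry pad_token "</s>" instead of a
# dedicated <pad> entry, e.g.:
# {'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>'}
print(tok.pad_token == tok.eos_token)  # True: padding now reuses the EOS token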
tokenizer.json CHANGED
@@ -1,6 +1,11 @@
 {
   "version": "1.0",
-  "truncation": null,
+  "truncation": {
+    "direction": "Right",
+    "max_length": 128,
+    "strategy": "LongestFirst",
+    "stride": 0
+  },
   "padding": null,
   "added_tokens": [
     {
@@ -9,7 +14,7 @@
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
-      "normalized": false,
+      "normalized": true,
       "special": true
     },
     {
@@ -18,7 +23,7 @@
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
-      "normalized": false,
+      "normalized": true,
       "special": true
     },
     {
@@ -27,7 +32,7 @@
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
-      "normalized": false,
+      "normalized": true,
       "special": true
     },
     {
@@ -36,8 +41,8 @@
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
-      "normalized": false,
-      "special": true
+      "normalized": true,
+      "special": false
     }
   ],
   "normalizer": {
tokenizer_config.json CHANGED
@@ -6,7 +6,7 @@
     "0": {
       "content": "<unk>",
       "lstrip": false,
-      "normalized": false,
+      "normalized": true,
       "rstrip": false,
       "single_word": false,
       "special": true
@@ -14,7 +14,7 @@
     "1": {
       "content": "<s>",
       "lstrip": false,
-      "normalized": false,
+      "normalized": true,
       "rstrip": false,
       "single_word": false,
       "special": true
@@ -22,7 +22,7 @@
     "2": {
       "content": "</s>",
       "lstrip": false,
-      "normalized": false,
+      "normalized": true,
       "rstrip": false,
       "single_word": false,
       "special": true
@@ -30,10 +30,10 @@
     "32000": {
       "content": "<pad>",
       "lstrip": false,
-      "normalized": false,
+      "normalized": true,
       "rstrip": false,
       "single_word": false,
-      "special": true
+      "special": false
     }
   },
   "bos_token": "<s>",
@@ -41,8 +41,7 @@
   "eos_token": "</s>",
   "legacy": false,
   "model_max_length": 1000000000000000019884624838656,
-  "pad_token": "<pad>",
-  "padding_side": "right",
+  "pad_token": "</s>",
   "sp_model_kwargs": {},
   "tokenizer_class": "LlamaTokenizer",
   "unk_token": "<unk>",