danielhanchen committed on
Commit
f0508c5
1 Parent(s): 27bc3c1

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +3 -28
  2. tokenizer.json +11 -11
  3. tokenizer_config.json +3 -35
special_tokens_map.json CHANGED
@@ -1,30 +1,5 @@
1
  {
2
- "bos_token": {
3
- "content": "<s>",
4
- "lstrip": false,
5
- "normalized": true,
6
- "rstrip": false,
7
- "single_word": true
8
- },
9
- "eos_token": {
10
- "content": "</s>",
11
- "lstrip": false,
12
- "normalized": true,
13
- "rstrip": false,
14
- "single_word": true
15
- },
16
- "pad_token": {
17
- "content": "<unk>",
18
- "lstrip": false,
19
- "normalized": true,
20
- "rstrip": false,
21
- "single_word": true
22
- },
23
- "unk_token": {
24
- "content": "<unk>",
25
- "lstrip": false,
26
- "normalized": true,
27
- "rstrip": false,
28
- "single_word": true
29
- }
30
  }
 
1
  {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "unk_token": "<unk>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  }
tokenizer.json CHANGED
@@ -6,28 +6,28 @@
6
  {
7
  "id": 0,
8
  "content": "<unk>",
9
- "single_word": true,
10
  "lstrip": false,
11
  "rstrip": false,
12
- "normalized": true,
13
  "special": true
14
  },
15
  {
16
  "id": 1,
17
  "content": "<s>",
18
- "single_word": true,
19
  "lstrip": false,
20
  "rstrip": false,
21
- "normalized": true,
22
  "special": true
23
  },
24
  {
25
  "id": 2,
26
  "content": "</s>",
27
- "single_word": true,
28
  "lstrip": false,
29
  "rstrip": false,
30
- "normalized": true,
31
  "special": true
32
  }
33
  ],
@@ -10973,7 +10973,7 @@
10973
  "▁License": 10826,
10974
  "▁routine": 10827,
10975
  "ijing": 10828,
10976
- " —": 10829,
10977
  "fit": 10830,
10978
  "ть": 10831,
10979
  "▁limitations": 10832,
@@ -19519,7 +19519,7 @@
19519
  "▁Durham": 19372,
19520
  "▁booked": 19373,
19521
  "▁wounds": 19374,
19522
- ". ": 19375,
19523
  "▁Buddhist": 19376,
19524
  "▁motorcycle": 19377,
19525
  "▁Engineer": 19378,
@@ -32107,7 +32107,7 @@
32107
  "х": 31960,
32108
  "è": 31961,
32109
  "ц": 31962,
32110
- " ": 31963,
32111
  "ж": 31964,
32112
  "à": 31965,
32113
  "·": 31966,
@@ -56373,7 +56373,7 @@
56373
  "▁rout ine",
56374
  "ij ing",
56375
  "iji ng",
56376
- " —",
56377
  "fi t",
56378
  "f it",
56379
  "т ь",
@@ -73637,7 +73637,7 @@
73637
  "▁boo ked",
73638
  "▁w ounds",
73639
  "▁wound s",
73640
- ". ",
73641
  "▁Budd hist",
73642
  "▁Buddh ist",
73643
  "▁motor cycle",
 
6
  {
7
  "id": 0,
8
  "content": "<unk>",
9
+ "single_word": false,
10
  "lstrip": false,
11
  "rstrip": false,
12
+ "normalized": false,
13
  "special": true
14
  },
15
  {
16
  "id": 1,
17
  "content": "<s>",
18
+ "single_word": false,
19
  "lstrip": false,
20
  "rstrip": false,
21
+ "normalized": false,
22
  "special": true
23
  },
24
  {
25
  "id": 2,
26
  "content": "</s>",
27
+ "single_word": false,
28
  "lstrip": false,
29
  "rstrip": false,
30
+ "normalized": false,
31
  "special": true
32
  }
33
  ],
 
10973
  "▁License": 10826,
10974
  "▁routine": 10827,
10975
  "ijing": 10828,
10976
+ " —": 10829,
10977
  "fit": 10830,
10978
  "ть": 10831,
10979
  "▁limitations": 10832,
 
19519
  "▁Durham": 19372,
19520
  "▁booked": 19373,
19521
  "▁wounds": 19374,
19522
+ ". ": 19375,
19523
  "▁Buddhist": 19376,
19524
  "▁motorcycle": 19377,
19525
  "▁Engineer": 19378,
 
32107
  "х": 31960,
32108
  "è": 31961,
32109
  "ц": 31962,
32110
+ " ": 31963,
32111
  "ж": 31964,
32112
  "à": 31965,
32113
  "·": 31966,
 
56373
  "▁rout ine",
56374
  "ij ing",
56375
  "iji ng",
56376
+ "  —",
56377
  "fi t",
56378
  "f it",
56379
  "т ь",
 
73637
  "▁boo ked",
73638
  "▁w ounds",
73639
  "▁wound s",
73640
+ ".  ",
73641
  "▁Budd hist",
73642
  "▁Buddh ist",
73643
  "▁motor cycle",
tokenizer_config.json CHANGED
@@ -1,40 +1,8 @@
1
  {
2
- "add_bos_token": true,
3
- "add_eos_token": false,
4
- "bos_token": {
5
- "__type": "AddedToken",
6
- "content": "<s>",
7
- "lstrip": false,
8
- "normalized": true,
9
- "rstrip": false,
10
- "single_word": true
11
- },
12
  "clean_up_tokenization_spaces": false,
13
- "eos_token": {
14
- "__type": "AddedToken",
15
- "content": "</s>",
16
- "lstrip": false,
17
- "normalized": true,
18
- "rstrip": false,
19
- "single_word": true
20
- },
21
  "model_max_length": 1000000000000000019884624838656,
22
- "pad_token": {
23
- "__type": "AddedToken",
24
- "content": "<unk>",
25
- "lstrip": false,
26
- "normalized": true,
27
- "rstrip": false,
28
- "single_word": true
29
- },
30
- "sp_model_kwargs": {},
31
  "tokenizer_class": "LlamaTokenizer",
32
- "unk_token": {
33
- "__type": "AddedToken",
34
- "content": "<unk>",
35
- "lstrip": false,
36
- "normalized": true,
37
- "rstrip": false,
38
- "single_word": true
39
- }
40
  }
 
1
  {
2
+ "bos_token": "<s>",
 
 
 
 
 
 
 
 
 
3
  "clean_up_tokenization_spaces": false,
4
+ "eos_token": "</s>",
 
 
 
 
 
 
 
5
  "model_max_length": 1000000000000000019884624838656,
 
 
 
 
 
 
 
 
 
6
  "tokenizer_class": "LlamaTokenizer",
7
+ "unk_token": "<unk>"
 
 
 
 
 
 
 
8
  }