anhnv125 committed on
Commit
e49ed0b
1 Parent(s): 8314657

Upload tokenizer

Browse files
Files changed (6) hide show
  1. added_tokens.json +149 -1
  2. merges.txt +0 -0
  3. special_tokens_map.json +11 -17
  4. tokenizer.json +0 -0
  5. tokenizer_config.json +12 -14
  6. vocab.json +0 -0
added_tokens.json CHANGED
@@ -1,3 +1,151 @@
1
  {
2
- "[PAD]": 32000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  }
 
1
  {
2
+ "</s>": 50405,
3
+ "<</SYS>>": 50403,
4
+ "<<SYS>>": 50402,
5
+ "<s>": 50404,
6
+ "<|extratoken_100|>": 50356,
7
+ "<|extratoken_101|>": 50357,
8
+ "<|extratoken_102|>": 50358,
9
+ "<|extratoken_103|>": 50359,
10
+ "<|extratoken_104|>": 50360,
11
+ "<|extratoken_105|>": 50361,
12
+ "<|extratoken_106|>": 50362,
13
+ "<|extratoken_107|>": 50363,
14
+ "<|extratoken_108|>": 50364,
15
+ "<|extratoken_109|>": 50365,
16
+ "<|extratoken_10|>": 50266,
17
+ "<|extratoken_110|>": 50366,
18
+ "<|extratoken_111|>": 50367,
19
+ "<|extratoken_112|>": 50368,
20
+ "<|extratoken_113|>": 50369,
21
+ "<|extratoken_114|>": 50370,
22
+ "<|extratoken_115|>": 50371,
23
+ "<|extratoken_116|>": 50372,
24
+ "<|extratoken_117|>": 50373,
25
+ "<|extratoken_118|>": 50374,
26
+ "<|extratoken_119|>": 50375,
27
+ "<|extratoken_11|>": 50267,
28
+ "<|extratoken_120|>": 50376,
29
+ "<|extratoken_121|>": 50377,
30
+ "<|extratoken_122|>": 50378,
31
+ "<|extratoken_123|>": 50379,
32
+ "<|extratoken_124|>": 50380,
33
+ "<|extratoken_125|>": 50381,
34
+ "<|extratoken_126|>": 50382,
35
+ "<|extratoken_127|>": 50383,
36
+ "<|extratoken_128|>": 50384,
37
+ "<|extratoken_129|>": 50385,
38
+ "<|extratoken_12|>": 50268,
39
+ "<|extratoken_130|>": 50386,
40
+ "<|extratoken_131|>": 50387,
41
+ "<|extratoken_132|>": 50388,
42
+ "<|extratoken_133|>": 50389,
43
+ "<|extratoken_134|>": 50390,
44
+ "<|extratoken_135|>": 50391,
45
+ "<|extratoken_136|>": 50392,
46
+ "<|extratoken_137|>": 50393,
47
+ "<|extratoken_138|>": 50394,
48
+ "<|extratoken_139|>": 50395,
49
+ "<|extratoken_13|>": 50269,
50
+ "<|extratoken_140|>": 50396,
51
+ "<|extratoken_141|>": 50397,
52
+ "<|extratoken_142|>": 50398,
53
+ "<|extratoken_143|>": 50399,
54
+ "<|extratoken_14|>": 50270,
55
+ "<|extratoken_15|>": 50271,
56
+ "<|extratoken_16|>": 50272,
57
+ "<|extratoken_17|>": 50273,
58
+ "<|extratoken_18|>": 50274,
59
+ "<|extratoken_19|>": 50275,
60
+ "<|extratoken_1|>": 50257,
61
+ "<|extratoken_20|>": 50276,
62
+ "<|extratoken_21|>": 50277,
63
+ "<|extratoken_22|>": 50278,
64
+ "<|extratoken_23|>": 50279,
65
+ "<|extratoken_24|>": 50280,
66
+ "<|extratoken_25|>": 50281,
67
+ "<|extratoken_26|>": 50282,
68
+ "<|extratoken_27|>": 50283,
69
+ "<|extratoken_28|>": 50284,
70
+ "<|extratoken_29|>": 50285,
71
+ "<|extratoken_2|>": 50258,
72
+ "<|extratoken_30|>": 50286,
73
+ "<|extratoken_31|>": 50287,
74
+ "<|extratoken_32|>": 50288,
75
+ "<|extratoken_33|>": 50289,
76
+ "<|extratoken_34|>": 50290,
77
+ "<|extratoken_35|>": 50291,
78
+ "<|extratoken_36|>": 50292,
79
+ "<|extratoken_37|>": 50293,
80
+ "<|extratoken_38|>": 50294,
81
+ "<|extratoken_39|>": 50295,
82
+ "<|extratoken_3|>": 50259,
83
+ "<|extratoken_40|>": 50296,
84
+ "<|extratoken_41|>": 50297,
85
+ "<|extratoken_42|>": 50298,
86
+ "<|extratoken_43|>": 50299,
87
+ "<|extratoken_44|>": 50300,
88
+ "<|extratoken_45|>": 50301,
89
+ "<|extratoken_46|>": 50302,
90
+ "<|extratoken_47|>": 50303,
91
+ "<|extratoken_48|>": 50304,
92
+ "<|extratoken_49|>": 50305,
93
+ "<|extratoken_4|>": 50260,
94
+ "<|extratoken_50|>": 50306,
95
+ "<|extratoken_51|>": 50307,
96
+ "<|extratoken_52|>": 50308,
97
+ "<|extratoken_53|>": 50309,
98
+ "<|extratoken_54|>": 50310,
99
+ "<|extratoken_55|>": 50311,
100
+ "<|extratoken_56|>": 50312,
101
+ "<|extratoken_57|>": 50313,
102
+ "<|extratoken_58|>": 50314,
103
+ "<|extratoken_59|>": 50315,
104
+ "<|extratoken_5|>": 50261,
105
+ "<|extratoken_60|>": 50316,
106
+ "<|extratoken_61|>": 50317,
107
+ "<|extratoken_62|>": 50318,
108
+ "<|extratoken_63|>": 50319,
109
+ "<|extratoken_64|>": 50320,
110
+ "<|extratoken_65|>": 50321,
111
+ "<|extratoken_66|>": 50322,
112
+ "<|extratoken_67|>": 50323,
113
+ "<|extratoken_68|>": 50324,
114
+ "<|extratoken_69|>": 50325,
115
+ "<|extratoken_6|>": 50262,
116
+ "<|extratoken_70|>": 50326,
117
+ "<|extratoken_71|>": 50327,
118
+ "<|extratoken_72|>": 50328,
119
+ "<|extratoken_73|>": 50329,
120
+ "<|extratoken_74|>": 50330,
121
+ "<|extratoken_75|>": 50331,
122
+ "<|extratoken_76|>": 50332,
123
+ "<|extratoken_77|>": 50333,
124
+ "<|extratoken_78|>": 50334,
125
+ "<|extratoken_79|>": 50335,
126
+ "<|extratoken_7|>": 50263,
127
+ "<|extratoken_80|>": 50336,
128
+ "<|extratoken_81|>": 50337,
129
+ "<|extratoken_82|>": 50338,
130
+ "<|extratoken_83|>": 50339,
131
+ "<|extratoken_84|>": 50340,
132
+ "<|extratoken_85|>": 50341,
133
+ "<|extratoken_86|>": 50342,
134
+ "<|extratoken_87|>": 50343,
135
+ "<|extratoken_88|>": 50344,
136
+ "<|extratoken_89|>": 50345,
137
+ "<|extratoken_8|>": 50264,
138
+ "<|extratoken_90|>": 50346,
139
+ "<|extratoken_91|>": 50347,
140
+ "<|extratoken_92|>": 50348,
141
+ "<|extratoken_93|>": 50349,
142
+ "<|extratoken_94|>": 50350,
143
+ "<|extratoken_95|>": 50351,
144
+ "<|extratoken_96|>": 50352,
145
+ "<|extratoken_97|>": 50353,
146
+ "<|extratoken_98|>": 50354,
147
+ "<|extratoken_99|>": 50355,
148
+ "<|extratoken_9|>": 50265,
149
+ "[/INST]": 50400,
150
+ "[INST]": 50401
151
  }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json CHANGED
@@ -1,23 +1,17 @@
1
  {
2
- "bos_token": {
3
- "content": "<s>",
4
- "lstrip": false,
5
- "normalized": false,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "eos_token": {
10
- "content": "</s>",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "pad_token": "[PAD]",
17
  "unk_token": {
18
- "content": "<unk>",
19
  "lstrip": false,
20
- "normalized": false,
21
  "rstrip": false,
22
  "single_word": false
23
  }
 
1
  {
2
+ "additional_special_tokens": [
3
+ "[/INST]",
4
+ "[INST]",
5
+ "<<SYS>>",
6
+ "<</SYS>>"
7
+ ],
8
+ "bos_token": "<s>",
9
+ "eos_token": "</s>",
10
+ "pad_token": "<|endoftext|>",
 
 
 
 
 
 
11
  "unk_token": {
12
+ "content": "<|endoftext|>",
13
  "lstrip": false,
14
+ "normalized": true,
15
  "rstrip": false,
16
  "single_word": false
17
  }
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,33 +1,31 @@
1
  {
 
2
  "bos_token": {
3
  "__type": "AddedToken",
4
- "content": "<s>",
5
  "lstrip": false,
6
- "normalized": false,
7
  "rstrip": false,
8
  "single_word": false
9
  },
10
- "clean_up_tokenization_spaces": false,
11
  "eos_token": {
12
  "__type": "AddedToken",
13
- "content": "</s>",
14
  "lstrip": false,
15
- "normalized": false,
16
  "rstrip": false,
17
  "single_word": false
18
  },
19
- "legacy": false,
20
- "model_max_length": 1000000000000000019884624838656,
21
- "pad_token": null,
22
- "sp_model_kwargs": {},
23
- "tokenizer_class": "LlamaTokenizer",
24
  "unk_token": {
25
  "__type": "AddedToken",
26
- "content": "<unk>",
27
  "lstrip": false,
28
- "normalized": false,
29
  "rstrip": false,
30
  "single_word": false
31
- },
32
- "use_fast": false
33
  }
 
1
  {
2
+ "add_prefix_space": false,
3
  "bos_token": {
4
  "__type": "AddedToken",
5
+ "content": "<|endoftext|>",
6
  "lstrip": false,
7
+ "normalized": true,
8
  "rstrip": false,
9
  "single_word": false
10
  },
11
+ "clean_up_tokenization_spaces": true,
12
  "eos_token": {
13
  "__type": "AddedToken",
14
+ "content": "<|endoftext|>",
15
  "lstrip": false,
16
+ "normalized": true,
17
  "rstrip": false,
18
  "single_word": false
19
  },
20
+ "errors": "replace",
21
+ "model_max_length": 1024,
22
+ "tokenizer_class": "GPT2Tokenizer",
 
 
23
  "unk_token": {
24
  "__type": "AddedToken",
25
+ "content": "<|endoftext|>",
26
  "lstrip": false,
27
+ "normalized": true,
28
  "rstrip": false,
29
  "single_word": false
30
+ }
 
31
  }
vocab.json ADDED
The diff for this file is too large to render. See raw diff