Aratako committed on
Commit
c6567bf
1 Parent(s): f5de572

Fix tokenizer issue

Browse files
Files changed (1) hide show
  1. tokenizer.json +3 -207
tokenizer.json CHANGED
@@ -119,218 +119,14 @@
119
  "rstrip": false,
120
  "normalized": false,
121
  "special": true
122
- },
123
- {
124
- "id": 65001,
125
- "content": "<extra_id_0>",
126
- "single_word": false,
127
- "lstrip": false,
128
- "rstrip": false,
129
- "normalized": false,
130
- "special": true
131
- },
132
- {
133
- "id": 65002,
134
- "content": "<extra_id_1>",
135
- "single_word": false,
136
- "lstrip": false,
137
- "rstrip": false,
138
- "normalized": false,
139
- "special": true
140
- },
141
- {
142
- "id": 65003,
143
- "content": "<extra_id_2>",
144
- "single_word": false,
145
- "lstrip": false,
146
- "rstrip": false,
147
- "normalized": false,
148
- "special": true
149
- },
150
- {
151
- "id": 65004,
152
- "content": "<extra_id_3>",
153
- "single_word": false,
154
- "lstrip": false,
155
- "rstrip": false,
156
- "normalized": false,
157
- "special": true
158
- },
159
- {
160
- "id": 65005,
161
- "content": "<extra_id_4>",
162
- "single_word": false,
163
- "lstrip": false,
164
- "rstrip": false,
165
- "normalized": false,
166
- "special": true
167
- },
168
- {
169
- "id": 65006,
170
- "content": "<extra_id_5>",
171
- "single_word": false,
172
- "lstrip": false,
173
- "rstrip": false,
174
- "normalized": false,
175
- "special": true
176
- },
177
- {
178
- "id": 65007,
179
- "content": "<extra_id_6>",
180
- "single_word": false,
181
- "lstrip": false,
182
- "rstrip": false,
183
- "normalized": false,
184
- "special": true
185
- },
186
- {
187
- "id": 65008,
188
- "content": "<extra_id_7>",
189
- "single_word": false,
190
- "lstrip": false,
191
- "rstrip": false,
192
- "normalized": false,
193
- "special": true
194
- },
195
- {
196
- "id": 65009,
197
- "content": "<extra_id_8>",
198
- "single_word": false,
199
- "lstrip": false,
200
- "rstrip": false,
201
- "normalized": false,
202
- "special": true
203
- },
204
- {
205
- "id": 65010,
206
- "content": "<extra_id_9>",
207
- "single_word": false,
208
- "lstrip": false,
209
- "rstrip": false,
210
- "normalized": false,
211
- "special": true
212
- },
213
- {
214
- "id": 65011,
215
- "content": "<extra_id_10>",
216
- "single_word": false,
217
- "lstrip": false,
218
- "rstrip": false,
219
- "normalized": false,
220
- "special": true
221
- },
222
- {
223
- "id": 65012,
224
- "content": "<extra_id_11>",
225
- "single_word": false,
226
- "lstrip": false,
227
- "rstrip": false,
228
- "normalized": false,
229
- "special": true
230
- },
231
- {
232
- "id": 65013,
233
- "content": "<extra_id_12>",
234
- "single_word": false,
235
- "lstrip": false,
236
- "rstrip": false,
237
- "normalized": false,
238
- "special": true
239
- },
240
- {
241
- "id": 65014,
242
- "content": "<extra_id_13>",
243
- "single_word": false,
244
- "lstrip": false,
245
- "rstrip": false,
246
- "normalized": false,
247
- "special": true
248
- },
249
- {
250
- "id": 65015,
251
- "content": "<extra_id_14>",
252
- "single_word": false,
253
- "lstrip": false,
254
- "rstrip": false,
255
- "normalized": false,
256
- "special": true
257
- },
258
- {
259
- "id": 65016,
260
- "content": "<extra_id_15>",
261
- "single_word": false,
262
- "lstrip": false,
263
- "rstrip": false,
264
- "normalized": false,
265
- "special": true
266
- },
267
- {
268
- "id": 65017,
269
- "content": "<extra_id_16>",
270
- "single_word": false,
271
- "lstrip": false,
272
- "rstrip": false,
273
- "normalized": false,
274
- "special": true
275
- },
276
- {
277
- "id": 65018,
278
- "content": "<extra_id_17>",
279
- "single_word": false,
280
- "lstrip": false,
281
- "rstrip": false,
282
- "normalized": false,
283
- "special": true
284
- },
285
- {
286
- "id": 65019,
287
- "content": "<extra_id_18>",
288
- "single_word": false,
289
- "lstrip": false,
290
- "rstrip": false,
291
- "normalized": false,
292
- "special": true
293
- },
294
- {
295
- "id": 65020,
296
- "content": "<extra_id_19>",
297
- "single_word": false,
298
- "lstrip": false,
299
- "rstrip": false,
300
- "normalized": false,
301
- "special": true
302
- },
303
- {
304
- "id": 65021,
305
- "content": "<extra_id_20>",
306
- "single_word": false,
307
- "lstrip": false,
308
- "rstrip": false,
309
- "normalized": false,
310
- "special": true
311
- },
312
- {
313
- "id": 65022,
314
- "content": "<extra_id_21>",
315
- "single_word": false,
316
- "lstrip": false,
317
- "rstrip": false,
318
- "normalized": false,
319
- "special": true
320
- },
321
- {
322
- "id": 65023,
323
- "content": "<extra_id_22>",
324
- "single_word": false,
325
- "lstrip": false,
326
- "rstrip": false,
327
- "normalized": false,
328
- "special": true
329
  }
330
  ],
331
  "normalizer": {
332
  "type": "Sequence",
333
  "normalizers": [
 
 
 
334
  {
335
  "type": "Replace",
336
  "pattern": {
 
119
  "rstrip": false,
120
  "normalized": false,
121
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  }
123
  ],
124
  "normalizer": {
125
  "type": "Sequence",
126
  "normalizers": [
127
+ {
128
+ "type": "NFKC"
129
+ },
130
  {
131
  "type": "Replace",
132
  "pattern": {