add new fast version
tokenizer.json  +2 -5
@@ -43,16 +43,13 @@
   "pre_tokenizer": {
     "type": "Sequence",
     "pretokenizers": [
-      {
-        "type": "WhitespaceSplit"
-      },
       {
         "type": "Split",
         "pattern": {
           "Regex": "<\\|startoftext\\|>|<\\|endoftext\\|>|'s|'t|'re|'ve|'m|'ll|'d|[\\p{L}]+|[\\p{N}]|[^\\s\\p{L}\\p{N}]+"
         },
-        "behavior": "
-        "invert":
+        "behavior": "Removed",
+        "invert": true
       },
       {
         "type": "ByteLevel",
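For reference, here is a minimal sketch (an assumption, not part of this commit) of how the updated pre_tokenizer can be rebuilt with the Hugging Face `tokenizers` Python API. With "behavior": "Removed" and "invert": true, the Split step keeps only the regex matches and drops the text between them, whitespace included, which is presumably why the separate WhitespaceSplit entry was removed. The ByteLevel options are not visible in this hunk, so the snippet assumes library defaults for that step.

# Sketch (assumption, not part of this commit): rebuild the new
# pre_tokenizer with the Hugging Face `tokenizers` Python API.
from tokenizers import Regex, pre_tokenizers

pattern = Regex(
    r"<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d"
    r"|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+"
)

pre_tok = pre_tokenizers.Sequence([
    # invert=True keeps the regex matches as pieces; behavior="removed"
    # discards the text between matches (e.g. whitespace), so no separate
    # WhitespaceSplit step is needed.
    pre_tokenizers.Split(pattern, behavior="removed", invert=True),
    # ByteLevel options are not shown in the hunk; defaults assumed here.
    pre_tokenizers.ByteLevel(),
])

# Prints the pre-tokenized pieces together with their offsets.
print(pre_tok.pre_tokenize_str("a photo of a cat<|endoftext|>"))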