Added missing normalizer to tokenizer
Browse files
- tokenizer.json +7 -0
tokenizer.json
CHANGED
@@ -76,6 +76,13 @@
|
|
76 |
"String": "<foreign>"
|
77 |
},
|
78 |
"content": "UNK"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
}
|
80 |
]
|
81 |
},
|
|
|
76 |
"String": "<foreign>"
|
77 |
},
|
78 |
"content": "UNK"
|
79 |
+
},
|
80 |
+
{
|
81 |
+
"type": "Replace",
|
82 |
+
"pattern": {
|
83 |
+
"Regex": "[^-\u0000--‿₠-₿∀-⋿⅐-↋ff-ﭏ]+"
|
84 |
+
},
|
85 |
+
"content": "UNK"
|
86 |
}
|
87 |
]
|
88 |
},
|