SaulLu committed on
Commit
774a92e
1 Parent(s): 5857595

add new fast version

Browse files
Files changed (1) hide show
  1. tokenizer.json +2 -5
tokenizer.json CHANGED
@@ -43,16 +43,13 @@
43
  "pre_tokenizer": {
44
  "type": "Sequence",
45
  "pretokenizers": [
46
- {
47
- "type": "WhitespaceSplit"
48
- },
49
  {
50
  "type": "Split",
51
  "pattern": {
52
  "Regex": "<\\|startoftext\\|>|<\\|endoftext\\|>|'s|'t|'re|'ve|'m|'ll|'d|[\\p{L}]+|[\\p{N}]|[^\\s\\p{L}\\p{N}]+"
53
  },
54
- "behavior": "MergedWithPrevious",
55
- "invert": false
56
  },
57
  {
58
  "type": "ByteLevel",
 
43
  "pre_tokenizer": {
44
  "type": "Sequence",
45
  "pretokenizers": [
 
 
 
46
  {
47
  "type": "Split",
48
  "pattern": {
49
  "Regex": "<\\|startoftext\\|>|<\\|endoftext\\|>|'s|'t|'re|'ve|'m|'ll|'d|[\\p{L}]+|[\\p{N}]|[^\\s\\p{L}\\p{N}]+"
50
  },
51
+ "behavior": "Removed",
52
+ "invert": true
53
  },
54
  {
55
  "type": "ByteLevel",