Xenova (HF staff) committed
Commit 2097975
Parent: 335f49a

Update tokenizer files

Files changed (2):
  1. tokenizer.json +10 -1
  2. tokenizer_config.json +1 -1
tokenizer.json CHANGED
@@ -34,6 +34,10 @@
   "normalizer": {
     "type": "Sequence",
     "normalizers": [
+      {
+        "type": "Prepend",
+        "prepend": "▁"
+      },
       {
         "type": "Replace",
         "pattern": {
@@ -85,6 +89,12 @@
       },
       {
         "type": "Fuse"
+      },
+      {
+        "type": "Strip",
+        "content": " ",
+        "start": 1,
+        "stop": 0
       }
     ]
   },
@@ -96,7 +106,6 @@
     "end_of_word_suffix": null,
     "fuse_unk": true,
     "byte_fallback": true,
-    "ignore_merges": false,
     "vocab": {
       "<unk>": 0,
       "<s>": 1,
tokenizer_config.json CHANGED
@@ -1,7 +1,7 @@
 {
   "add_bos_token": false,
   "add_eos_token": false,
-  "add_prefix_space": false,
+  "add_prefix_space": true,
   "added_tokens_decoder": {
     "0": {
       "content": "<unk>",