change padding token to newline (id=11)
tokenization_rwkv_world.py
CHANGED
@@ -93,7 +93,7 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]

-    def __init__(self, vocab_file, errors="replace", pad_token="
+    def __init__(self, vocab_file, errors="replace", pad_token="\n", **kwargs):
         self.add_bos_token = False
         self.encoder = {}
         sorted = []  # must be already sorted
@@ -133,7 +133,7 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):

     @property
     def pad_token_id(self) -> Optional[int]:
-        return
+        return 11

     @property
     def vocab_size(self):
@@ -264,11 +264,11 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):

     def _convert_token_to_id(self, token):
         """Converts a token (str) in an id using the vocab."""
-        return self.
+        return self.decoder.get(token.encode("utf-8"), self.unk_token_id)

     def _convert_id_to_token(self, index):
         """Converts an index (integer) in a token (str) using the vocab."""
-        return self.
+        return self.encoder.get(index)

     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not os.path.exists(save_directory):
@@ -316,7 +316,7 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
         verbose: bool = True,
         **kwargs,
     ) -> BatchEncoding:
-        def get_input_ids(text, max_length=None, pad_token_id=
+        def get_input_ids(text, max_length=None, pad_token_id=11):
             def pad_sequence(seq, max_len, pad_tok):
                 return [pad_tok] * (max_len - len(seq)) + seq
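
For context, a minimal sketch (not part of the commit) of the padding behaviour the diff introduces: sequences shorter than the batch maximum are left-padded with the new pad id 11, which is the newline token per the commit title. The pad_sequence helper is copied from get_input_ids above; the token ids in the example batch are invented for illustration.

    # Illustrative sketch only; example ids are made up, not from the vocab file.
    PAD_TOKEN_ID = 11  # "\n" in the RWKV World vocab, per this commit

    def pad_sequence(seq, max_len, pad_tok):
        # Prepend pad tokens so every sequence reaches max_len (left padding).
        return [pad_tok] * (max_len - len(seq)) + seq

    batch = [[33, 4913, 261], [33, 4913]]  # hypothetical token id sequences
    max_len = max(len(ids) for ids in batch)
    padded = [pad_sequence(ids, max_len, PAD_TOKEN_ID) for ids in batch]
    print(padded)  # [[33, 4913, 261], [11, 33, 4913]]

Left padding keeps the real tokens at the end of each sequence, presumably so that a recurrent model like RWKV computes its final state from actual input rather than trailing pad tokens.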