Update tokenization_rwkv_world.py
Browse files
tokenization_rwkv_world.py
CHANGED
@@ -93,7 +93,7 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
|
|
93 |
vocab_files_names = VOCAB_FILES_NAMES
|
94 |
model_input_names = ["input_ids", "attention_mask"]
|
95 |
|
96 |
-
def __init__(self, vocab_file, errors="replace", pad_token="
|
97 |
self.add_bos_token = False
|
98 |
self.encoder = {}
|
99 |
sorted = [] # must be already sorted
|
@@ -133,7 +133,7 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
|
|
133 |
|
134 |
@property
|
135 |
def pad_token_id(self) -> Optional[int]:
|
136 |
-
return
|
137 |
|
138 |
@property
|
139 |
def vocab_size(self):
|
@@ -316,7 +316,7 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
|
|
316 |
verbose: bool = True,
|
317 |
**kwargs,
|
318 |
) -> BatchEncoding:
|
319 |
-
def get_input_ids(text, max_length=None, pad_token_id=
|
320 |
def pad_sequence(seq, max_len, pad_tok):
|
321 |
return [pad_tok] * (max_len - len(seq)) + seq
|
322 |
|
|
|
93 |
vocab_files_names = VOCAB_FILES_NAMES
|
94 |
model_input_names = ["input_ids", "attention_mask"]
|
95 |
|
96 |
+
def __init__(self, vocab_file, errors="replace", pad_token="0", **kwargs):
|
97 |
self.add_bos_token = False
|
98 |
self.encoder = {}
|
99 |
sorted = [] # must be already sorted
|
|
|
133 |
|
134 |
@property
|
135 |
def pad_token_id(self) -> Optional[int]:
|
136 |
+
return 0
|
137 |
|
138 |
@property
|
139 |
def vocab_size(self):
|
|
|
316 |
verbose: bool = True,
|
317 |
**kwargs,
|
318 |
) -> BatchEncoding:
|
319 |
+
def get_input_ids(text, max_length=None, pad_token_id=0):
|
320 |
def pad_sequence(seq, max_len, pad_tok):
|
321 |
return [pad_tok] * (max_len - len(seq)) + seq
|
322 |
|