SmerkyG committed on
Commit
49f029b
1 Parent(s): 4c69ede

change padding token to newline (id=11)

Browse files
Files changed (1) hide show
  1. tokenization_rwkv_world.py +5 -5
tokenization_rwkv_world.py CHANGED
@@ -93,7 +93,7 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
93
  vocab_files_names = VOCAB_FILES_NAMES
94
  model_input_names = ["input_ids", "attention_mask"]
95
 
96
- def __init__(self, vocab_file, errors="replace", pad_token="0", **kwargs):
97
  self.add_bos_token = False
98
  self.encoder = {}
99
  sorted = [] # must be already sorted
@@ -133,7 +133,7 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
133
 
134
  @property
135
  def pad_token_id(self) -> Optional[int]:
136
- return 0
137
 
138
  @property
139
  def vocab_size(self):
@@ -264,11 +264,11 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
264
 
265
  def _convert_token_to_id(self, token):
266
  """Converts a token (str) in an id using the vocab."""
267
- return self.encoder.get(token, self.encoder.get(self.unk_token))
268
 
269
  def _convert_id_to_token(self, index):
270
  """Converts an index (integer) in a token (str) using the vocab."""
271
- return self.decoder.get(index)
272
 
273
  def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
274
  if not os.path.exists(save_directory):
@@ -316,7 +316,7 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
316
  verbose: bool = True,
317
  **kwargs,
318
  ) -> BatchEncoding:
319
- def get_input_ids(text, max_length=None, pad_token_id=0):
320
  def pad_sequence(seq, max_len, pad_tok):
321
  return [pad_tok] * (max_len - len(seq)) + seq
322
 
 
93
  vocab_files_names = VOCAB_FILES_NAMES
94
  model_input_names = ["input_ids", "attention_mask"]
95
 
96
+ def __init__(self, vocab_file, errors="replace", pad_token="\n", **kwargs):
97
  self.add_bos_token = False
98
  self.encoder = {}
99
  sorted = [] # must be already sorted
 
133
 
134
  @property
135
  def pad_token_id(self) -> Optional[int]:
136
+ return 11
137
 
138
  @property
139
  def vocab_size(self):
 
264
 
265
  def _convert_token_to_id(self, token):
266
  """Converts a token (str) in an id using the vocab."""
267
+ return self.decoder.get(token.encode("utf-8"), self.unk_token_id)
268
 
269
  def _convert_id_to_token(self, index):
270
  """Converts an index (integer) in a token (str) using the vocab."""
271
+ return self.encoder.get(index)
272
 
273
  def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
274
  if not os.path.exists(save_directory):
 
316
  verbose: bool = True,
317
  **kwargs,
318
  ) -> BatchEncoding:
319
+ def get_input_ids(text, max_length=None, pad_token_id=11):
320
  def pad_sequence(seq, max_len, pad_tok):
321
  return [pad_tok] * (max_len - len(seq)) + seq
322