Fix "can't set attribute 'eos_token'" when loading a saved tokenizer

#27
by hiyouga
Files changed (1)
  1. tokenization_chatglm.py +48 -20
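
Summary (inferred from the diff below, since the PR has no description): when transformers reloads a tokenizer that was saved with save_pretrained, it assigns the saved special tokens back onto the instance. In the previous version of this file, eos_token, pad_token and unk_token were read-only properties returning hard-coded strings ("</s>", "<unk>"), so that assignment failed with the error in the title. The patch derives these tokens from the SentencePiece model via get_command, adds an "<unk>" entry to the special-token map, exposes unk_token_id and pad_token_id, and defines setters that log a warning and keep the built-in values instead of raising. It also adds a module-level logger, reformats the long signatures, and stops forwarding encode_special_tokens to super().__init__().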
tokenization_chatglm.py CHANGED
@@ -8,6 +8,9 @@ from transformers.utils import logging, PaddingStrategy
 from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
 
 
+logger = logging.get_logger(__name__)
+
+
 class SPTokenizer:
     def __init__(self, model_path: str):
         # reload tokenizer
@@ -89,25 +92,34 @@ class SPTokenizer:
 
 
 class ChatGLMTokenizer(PreTrainedTokenizer):
-    vocab_files_names = {"vocab_file": "tokenizer.model"}
 
+    vocab_files_names = {"vocab_file": "tokenizer.model"}
     model_input_names = ["input_ids", "attention_mask", "position_ids"]
 
-    def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False,
-                 **kwargs):
+    def __init__(
+        self,
+        vocab_file,
+        padding_side="left",
+        clean_up_tokenization_spaces=False,
+        encode_special_tokens=False,
+        **kwargs
+    ):
         self.name = "GLMTokenizer"
-
         self.vocab_file = vocab_file
         self.tokenizer = SPTokenizer(vocab_file)
         self.special_tokens = {
             "<bos>": self.tokenizer.bos_id,
             "<eos>": self.tokenizer.eos_id,
+            "<unk>": self.tokenizer.pad_id,
             "<pad>": self.tokenizer.pad_id
         }
         self.encode_special_tokens = encode_special_tokens
-        super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-                         encode_special_tokens=encode_special_tokens,
-                         **kwargs)
+
+        super().__init__(
+            padding_side=padding_side,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs
+        )
 
     def get_command(self, token):
         if token in self.special_tokens:
@@ -117,24 +129,40 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
 
     @property
     def unk_token(self) -> str:
-        return "<unk>"
+        return self.tokenizer.sp_model.IdToPiece(self.get_command("<unk>"))
 
     @property
     def pad_token(self) -> str:
-        return "<unk>"
+        return self.tokenizer.sp_model.IdToPiece(self.get_command("<pad>"))
 
     @property
-    def pad_token_id(self):
-        return self.get_command("<pad>")
+    def eos_token(self) -> str:
+        return self.tokenizer.sp_model.IdToPiece(self.get_command("<eos>"))
 
     @property
-    def eos_token(self) -> str:
-        return "</s>"
+    def unk_token_id(self) -> int:
+        return self.get_command("<unk>")
+
+    @property
+    def pad_token_id(self) -> int:
+        return self.get_command("<pad>")
 
     @property
     def eos_token_id(self):
         return self.get_command("<eos>")
 
+    @unk_token.setter
+    def unk_token(self, value):
+        logger.warning("Setting unk_token is not supported, use the default one.")
+
+    @pad_token.setter
+    def pad_token(self, value):
+        logger.warning("Setting pad_token is not supported, use the default one.")
+
+    @eos_token.setter
+    def eos_token(self, value):
+        logger.warning("Setting eos_token is not supported, use the default one.")
+
     @property
     def vocab_size(self):
         return self.tokenizer.n_words
@@ -212,7 +240,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
         return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True)
 
     def build_inputs_with_special_tokens(
-            self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
     ) -> List[int]:
         """
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
@@ -237,12 +265,12 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
         return token_ids_0
 
     def _pad(
-            self,
-            encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
-            max_length: Optional[int] = None,
-            padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-            pad_to_multiple_of: Optional[int] = None,
-            return_attention_mask: Optional[bool] = None,
+        self,
+        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
+        max_length: Optional[int] = None,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        pad_to_multiple_of: Optional[int] = None,
+        return_attention_mask: Optional[bool] = None,
     ) -> dict:
         """
         Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
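
A minimal sketch of the save/reload round-trip the title refers to, assuming a local checkpoint directory that ships this tokenization_chatglm.py as custom code (the paths below are placeholders):

from transformers import AutoTokenizer

# "./chatglm-checkpoint" is a placeholder for any checkpoint containing this tokenizer code.
tokenizer = AutoTokenizer.from_pretrained("./chatglm-checkpoint", trust_remote_code=True)
tokenizer.save_pretrained("./saved_tokenizer")

# Before this patch, the reload below failed with "can't set attribute 'eos_token'",
# because from_pretrained re-applies the saved special tokens and the
# eos_token / pad_token / unk_token properties had no setters.
reloaded = AutoTokenizer.from_pretrained("./saved_tokenizer", trust_remote_code=True)

# With the patch, the assignment only triggers a warning from the new setters and the
# tokens stay derived from the SentencePiece model.
print(reloaded.eos_token, reloaded.eos_token_id)

The warn-and-ignore setters keep the special tokens pinned to the SentencePiece vocabulary while staying compatible with transformers' loading code, at the cost of silently ignoring user overrides.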