duzx16
committed on
Commit
•
71189e7
1
Parent(s):
eb3e683
Fix tokenization space
Browse files- tokenization_chatglm.py +5 -1
tokenization_chatglm.py
CHANGED
@@ -66,7 +66,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
|
|
66 |
model_input_names = ["input_ids", "attention_mask", "position_ids"]
|
67 |
|
68 |
def __init__(self, vocab_file, padding_side="left", **kwargs):
|
69 |
-
super().__init__(padding_side=padding_side, **kwargs)
|
70 |
self.name = "GLMTokenizer"
|
71 |
|
72 |
self.vocab_file = vocab_file
|
@@ -83,6 +83,10 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
|
|
83 |
assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}"
|
84 |
return self.tokenizer.special_tokens[token]
|
85 |
|
|
|
|
|
|
|
|
|
86 |
@property
|
87 |
def pad_token(self) -> str:
|
88 |
return "<unk>"
|
|
|
66 |
model_input_names = ["input_ids", "attention_mask", "position_ids"]
|
67 |
|
68 |
def __init__(self, vocab_file, padding_side="left", **kwargs):
|
69 |
+
super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=False, **kwargs)
|
70 |
self.name = "GLMTokenizer"
|
71 |
|
72 |
self.vocab_file = vocab_file
|
|
|
83 |
assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}"
|
84 |
return self.tokenizer.special_tokens[token]
|
85 |
|
86 |
+
@property
|
87 |
+
def unk_token(self) -> str:
|
88 |
+
return "<unk>"
|
89 |
+
|
90 |
@property
|
91 |
def pad_token(self) -> str:
|
92 |
return "<unk>"
|