tianxie-sf
committed on
Commit
•
943f44c
1
Parent(s):
2aa9556
Update tokenization_xgen.py (#16)
Browse files
- Update tokenization_xgen.py (f79aeb3407323d2df16be52abe61dbed426d58b9)
- tokenization_xgen.py +3 -3
tokenization_xgen.py
CHANGED
@@ -134,15 +134,15 @@ class XgenTokenizer(PreTrainedTokenizer):
|
|
134 |
):
|
135 |
pad_token_added = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
|
136 |
eos_token_added = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
|
|
|
|
|
137 |
super().__init__(
|
138 |
pad_token=pad_token_added,
|
139 |
eos_token=eos_token_added,
|
140 |
add_eos_token=add_eos_token,
|
141 |
add_special_tokens=add_special_tokens,
|
142 |
**kwargs,
|
143 |
-
)
|
144 |
-
self.add_eos_token = add_eos_token
|
145 |
-
self.encoder = tiktoken_tokenizer(base="gpt2", pad_token=pad_token, add_special=add_special_tokens)
|
146 |
|
147 |
@property
|
148 |
def vocab_size(self):
|
|
|
134 |
):
|
135 |
pad_token_added = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
|
136 |
eos_token_added = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
|
137 |
+
self.add_eos_token = add_eos_token
|
138 |
+
self.encoder = tiktoken_tokenizer(base="gpt2", pad_token=pad_token, add_special=add_special_tokens)
|
139 |
super().__init__(
|
140 |
pad_token=pad_token_added,
|
141 |
eos_token=eos_token_added,
|
142 |
add_eos_token=add_eos_token,
|
143 |
add_special_tokens=add_special_tokens,
|
144 |
**kwargs,
|
145 |
+
)
|
|
|
|
|
146 |
|
147 |
@property
|
148 |
def vocab_size(self):
|