from transformers import PreTrainedTokenizer, AddedToken | |
class CustomTokenizer(PreTrainedTokenizer):
    """Minimal whitespace tokenizer built on the HF ``PreTrainedTokenizer`` base.

    NOTE(review): overriding ``tokenize`` directly bypasses the base class's
    added-token / special-token handling; the conventional subclass hook is
    ``_tokenize``. Confirm this shortcut is intentional.
    """

    def __init__(self, vocab_file, **kwargs):
        """Create the tokenizer.

        Args:
            vocab_file: Path to the vocabulary file. The original code
                accepted this argument and silently discarded it; it is
                now kept on the instance so callers (and ``save_pretrained``
                machinery) can reach it.
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer``.
        """
        # Store subclass state BEFORE calling super().__init__(): the HF
        # base constructor may invoke overridable methods that consult it.
        self.vocab_file = vocab_file
        super().__init__(**kwargs)
        print("Initializing CustomTokenizer")

    def tokenize(self, text):
        """Split *text* on whitespace and return the list of tokens."""
        print("Tokenizing text", text)
        return text.split()