# custom-model/custom_tokenizer.py
import json
from transformers import PreTrainedTokenizer

class CustomTokenizer(PreTrainedTokenizer):
    def __init__(self, vocab_file, **kwargs):
        print("Initializing CustomTokenizer")
        with open(vocab_file, encoding="utf-8") as f:  # assumed: a JSON token->id map (the original never read vocab_file)
            self.vocab = json.load(f)  # loaded first: super().__init__() calls get_vocab() in recent transformers
        super().__init__(**kwargs)

    def get_vocab(self):
        return dict(self.vocab)

    def _tokenize(self, text):  # hook invoked by PreTrainedTokenizer.tokenize()
        print("Tokenizing text", text)
        return text.split()
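
# A minimal usage sketch, not part of the original file. It assumes this module
# runs as a script and that writing a tiny vocab.json alongside it is acceptable;
# the file name and vocab contents are hypothetical, chosen only to give the
# constructor something to load. On the Hub, a class like this would typically
# be loaded via AutoTokenizer.from_pretrained(..., trust_remote_code=True),
# assuming tokenizer_config.json maps AutoTokenizer to it via auto_map.
if __name__ == "__main__":
    with open("vocab.json", "w", encoding="utf-8") as f:
        json.dump({"hello": 0, "world": 1}, f)  # hypothetical toy vocab
    tok = CustomTokenizer(vocab_file="vocab.json")
    print(tok.tokenize("hello world"))  # routes through _tokenize -> ['hello', 'world']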