#! /usr/bin/python3
from transformers import DebertaV2TokenizerFast
from transformers.models.bert_japanese.tokenization_bert_japanese import JumanppTokenizer

class JumanppPreTokenizer(JumanppTokenizer):
  def jumanpp_split(self,i,normalized_string):
    import textspan
    t=str(normalized_string)
    k=self.tokenize(t)
    # Map the Juman++ tokens back onto character spans of the normalized string
    return [normalized_string[s:e] for c in textspan.get_original_spans(k,t) for s,e in c]
  def pre_tokenize(self,pretok):
    pretok.split(self.jumanpp_split)

class JumanppDebertaV2TokenizerFast(DebertaV2TokenizerFast):
  def __init__(self,**kwargs):
    from tokenizers.pre_tokenizers import PreTokenizer,Metaspace,Sequence
    super().__init__(**kwargs)
    # Run Juman++ word segmentation before the SentencePiece Metaspace pre-tokenizer
    self._tokenizer.pre_tokenizer=Sequence([PreTokenizer.custom(JumanppPreTokenizer()),Metaspace()])
  def save_pretrained(self,save_directory,**kwargs):
    import os
    import shutil
    from tokenizers.pre_tokenizers import PreTokenizer,Metaspace,Sequence
    self._auto_map={"AutoTokenizer":[None,"tokenizer.JumanppDebertaV2TokenizerFast"]}
    # A custom PreTokenizer cannot be serialized, so temporarily fall back to Metaspace while saving
    self._tokenizer.pre_tokenizer=Metaspace()
    super().save_pretrained(save_directory,**kwargs)
    # Restore the custom pre-tokenizer and ship this file alongside the saved tokenizer
    self._tokenizer.pre_tokenizer=Sequence([PreTokenizer.custom(JumanppPreTokenizer()),Metaspace()])
    shutil.copy(os.path.abspath(__file__),os.path.join(save_directory,"tokenizer.py"))
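
# --- Usage sketch (illustrative, not part of the tokenizer class itself) ---
# Assumptions: "path/to/checkpoint" and "path/to/output" are placeholder
# directories; the checkpoint already contains a DeBERTa-V2 SentencePiece
# tokenizer, and Juman++ (with pyknp) and the textspan package are installed.
# from_pretrained() on the subclass returns an instance whose __init__ above
# installs the custom Juman++ pre-tokenizer; save_pretrained() then copies this
# file next to the saved tokenizer so it can later be reloaded via
# AutoTokenizer.from_pretrained(..., trust_remote_code=True).
if __name__=="__main__":
  tkz=JumanppDebertaV2TokenizerFast.from_pretrained("path/to/checkpoint")
  print(tkz.tokenize("外国人参政権"))
  tkz.save_pretrained("path/to/output")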