#!/usr/bin/python3
from transformers import DebertaV2TokenizerFast
from transformers.models.bert_japanese.tokenization_bert_japanese import JumanppTokenizer


class JumanppPreTokenizer(JumanppTokenizer):
    """Custom pre-tokenizer that splits text into Juman++ morphemes."""

    def jumanpp_split(self, i, normalized_string):
        import textspan
        text = str(normalized_string)
        tokens = self.tokenize(text)
        # Map each Juman++ token back onto its span in the original string,
        # so that the fast tokenizer's offsets stay aligned.
        return [normalized_string[start:end] for spans in textspan.get_original_spans(tokens, text) for start, end in spans]

    def pre_tokenize(self, pretok):
        pretok.split(self.jumanpp_split)


class JumanppDebertaV2TokenizerFast(DebertaV2TokenizerFast):
    def __init__(self, **kwargs):
        from tokenizers.pre_tokenizers import PreTokenizer, Metaspace, Sequence
        super().__init__(**kwargs)
        # Run Juman++ word segmentation before the usual Metaspace pre-tokenizer.
        self._tokenizer.pre_tokenizer = Sequence([PreTokenizer.custom(JumanppPreTokenizer()), Metaspace()])

    def save_pretrained(self, save_directory, **kwargs):
        import os
        import shutil
        from tokenizers.pre_tokenizers import PreTokenizer, Metaspace, Sequence
        # Register this module so AutoTokenizer can reload the custom class
        # from the saved directory (requires trust_remote_code=True).
        self._auto_map = {"AutoTokenizer": [None, "tokenizer.JumanppDebertaV2TokenizerFast"]}
        # A custom Python pre-tokenizer cannot be serialized into tokenizer.json,
        # so temporarily fall back to plain Metaspace while saving.
        self._tokenizer.pre_tokenizer = Metaspace()
        super().save_pretrained(save_directory, **kwargs)
        # Restore the Juman++ pre-tokenizer and ship this file alongside the model.
        self._tokenizer.pre_tokenizer = Sequence([PreTokenizer.custom(JumanppPreTokenizer()), Metaspace()])
        shutil.copy(os.path.abspath(__file__), os.path.join(save_directory, "tokenizer.py"))
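

# Minimal usage sketch (illustration only, not part of the original upload).
# "path/to/model" is a placeholder for a directory or Hub repo that ships this
# tokenizer.py together with the saved tokenizer files. Loading through
# AutoTokenizer needs trust_remote_code=True because of the _auto_map entry
# registered in save_pretrained above, and it assumes a working Juman++ setup
# (the pyknp package plus the jumanpp binary) is installed.
if __name__ == "__main__":
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("path/to/model", trust_remote_code=True)
    # The custom pre-tokenizer segments the text with Juman++ before subword
    # splitting, so the returned offsets map back to the original string.
    encoding = tokenizer("外国人参政権", return_offsets_mapping=True)
    print(tokenizer.convert_ids_to_tokens(encoding["input_ids"]))
    print(encoding["offset_mapping"])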