{
"cls_token": "[CLS]",
"do_basic_tokenize": true,
"do_lower_case": false,
"mask_token": "[MASK]",
"model_max_length": 128,
"name_or_path": "KoichiYasuoka/roberta-small-japanese-aozora-char",
"never_split": [
"[PAD]",
"[UNK]",
"[CLS]",
"[SEP]",
"[MASK]"
],
"pad_token": "[PAD]",
"sep_token": "[SEP]",
"special_tokens_map_file": "/root/.cache/huggingface/hub/models--KoichiYasuoka--roberta-small-japanese-aozora-char/snapshots/dbbd6a003dc65a1876898e3667121ab48265cc94/special_tokens_map.json",
"strip_accents": false,
"tokenize_chinese_chars": true,
"tokenizer_class": "BertTokenizer",
"unk_token": "[UNK]"
}