diff --git "a/tokenizer.json" "b/tokenizer.json" new file mode 100644--- /dev/null +++ "b/tokenizer.json" @@ -0,0 +1,100365 @@ +{ + "version": "1.0", + "truncation": { + "direction": "Right", + "max_length": 256, + "strategy": "LongestFirst", + "stride": 0 + }, + "padding": { + "strategy": "BatchLongest", + "direction": "Right", + "pad_to_multiple_of": null, + "pad_id": 1, + "pad_type_id": 0, + "pad_token": "" + }, + "added_tokens": [ + { + "id": 0, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": true + }, + { + "id": 1, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": true + }, + { + "id": 2, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": true + }, + { + "id": 3, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": true, + "special": true + }, + { + "id": 4, + "content": "", + "single_word": false, + "lstrip": true, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": null, + "pre_tokenizer": { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": true, + "use_regex": true + }, + "post_processor": { + "type": "RobertaProcessing", + "sep": [ + "", + 2 + ], + "cls": [ + "", + 0 + ], + "trim_offsets": true, + "add_prefix_space": false + }, + "decoder": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": true, + "use_regex": true + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": null, + "continuing_subword_prefix": "", + "end_of_word_suffix": "", + "fuse_unk": false, + "byte_fallback": false, + "vocab": { + "": 0, + "": 1, + "": 2, + "": 3, + "": 4, + "": 5, + "": 6, + "": 7, + "