diff --git "a/tokenizer.json" "b/tokenizer.json" new file mode 100644--- /dev/null +++ "b/tokenizer.json" @@ -0,0 +1,103830 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 4, + "content": "", + "single_word": false, + "lstrip": true, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": null, + "pre_tokenizer": { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": true, + "use_regex": true + }, + "post_processor": { + "type": "RobertaProcessing", + "sep": [ + "", + 2 + ], + "cls": [ + "", + 0 + ], + "trim_offsets": true, + "add_prefix_space": false + }, + "decoder": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": true, + "use_regex": true + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": null, + "continuing_subword_prefix": "", + "end_of_word_suffix": "", + "fuse_unk": false, + "byte_fallback": false, + "vocab": { + "": 0, + "": 1, + "": 2, + "": 3, + "": 4, + "!": 5, + "\"": 6, + "#": 7, + "$": 8, + "%": 9, + "&": 10, + "'": 11, + "(": 12, + ")": 13, + "*": 14, + "+": 15, + ",": 16, + "-": 17, + ".": 18, + "/": 19, + "0": 20, + "1": 21, + "2": 22, + "3": 23, + "4": 24, + "5": 25, + "6": 26, + "7": 27, + "8": 28, + "9": 29, + ":": 30, + ";": 31, + "<": 32, + "=": 33, + ">": 34, + "?": 35, + "@": 36, + "A": 37, + "B": 38, + "C": 39, + "D": 40, + "E": 41, + "F": 42, + "G": 43, + "H": 44, + "I": 45, + "J": 46, + "K": 47, + "L": 48, + "M": 49, + "N": 50, + "O": 51, + "P": 52, + "Q": 53, + "R": 54, + "S": 55, + "T": 56, + "U": 57, + "V": 58, + "W": 59, + "X": 60, + "Y": 61, + "Z": 62, + "[": 63, + "\\": 64, + "]": 65, + "^": 66, + "_": 67, + "`": 68, + "a": 69, + "b": 70, + "c": 71, + "d": 72, + "e": 73, + "f": 74, + "g": 75, + "h": 76, + "i": 77, + "j": 78, + "k": 79, + "l": 80, + "m": 81, + "n": 82, + "o": 83, + "p": 84, + "q": 85, + "r": 86, + "s": 87, + "t": 88, + "u": 89, + "v": 90, + "w": 91, + "x": 92, + "y": 93, + "z": 94, + "{": 95, + "|": 96, + "}": 97, + "~": 98, + "¡": 99, + "¢": 100, + "£": 101, + "¤": 102, + "¥": 103, + "¦": 104, + "§": 105, + "¨": 106, + "©": 107, + "ª": 108, + "«": 109, + "¬": 110, + "®": 111, + "¯": 112, + "°": 113, + "±": 114, + "²": 115, + "³": 116, + "´": 117, + "µ": 118, + "¶": 119, + "·": 120, + "¸": 121, + "¹": 122, + "º": 123, + "»": 124, + "¼": 125, + "½": 126, + "¾": 127, + "¿": 128, + "À": 129, + "Á": 130, + "Â": 131, + "Ã": 132, + "Ä": 133, + "Å": 134, + "Æ": 135, + "Ç": 136, + "È": 137, + "É": 138, + "Ê": 139, + "Ë": 140, + "Ì": 141, + "Í": 142, + "Î": 143, + "Ï": 144, + "Ð": 145, + "Ñ": 146, + "Ò": 147, + "Ó": 148, + "Ô": 149, + "Õ": 150, + "Ö": 151, + "×": 152, + "Ø": 153, + "Ù": 154, + "Ú": 155, + "Û": 156, + "Ü": 157, + "Ý": 158, + "Þ": 159, + "ß": 160, + "à": 161, + "á": 162, + "â": 163, + "ã": 164, + "ä": 165, + "å": 166, + "æ": 167, + "ç": 168, + "è": 169, + "é": 170, + "ê": 171, + "ë": 172, + "ì": 173, + "í": 174, + "î": 175, + "ï": 176, + "ð": 177, + "ñ": 178, + "ò": 179, + "ó": 180, + "ô": 181, + "õ": 182, + "ö": 183, + "÷": 184, + "ø": 185, + "ù": 186, + "ú": 187, + "û": 188, + "ü": 189, + "ý": 190, + "þ": 191, + "ÿ": 192, + "Ā": 193, + "ā": 194, + "Ă": 195, + "ă": 196, + "Ą": 197, + "ą": 198, + "Ć": 199, + "ć": 200, + "Ĉ": 201, + "ĉ": 202, + "Ċ": 203, + "ċ": 204, + "Č": 205, + "č": 206, + "Ď": 207, + "ď": 208, + "Đ": 209, + "đ": 210, + "Ē": 211, + "ē": 212, + "Ĕ": 213, + "ĕ": 214, + "Ė": 215, + "ė": 216, + "Ę": 217, + "ę": 218, + "Ě": 219, + "ě": 220, + "Ĝ": 221, + "ĝ": 222, + "Ğ": 223, + "ğ": 224, + "Ġ": 225, + "ġ": 226, + "Ģ": 227, + "ģ": 228, + "Ĥ": 229, + "ĥ": 230, + "Ħ": 231, + "ħ": 232, + "Ĩ": 233, + "ĩ": 234, + "Ī": 235, + "ī": 236, + "Ĭ": 237, + "ĭ": 238, + "Į": 239, + "į": 240, + "İ": 241, + "ı": 242, + "IJ": 243, + "ij": 244, + "Ĵ": 245, + "ĵ": 246, + "Ķ": 247, + "ķ": 248, + "ĸ": 249, + "Ĺ": 250, + "ĺ": 251, + "Ļ": 252, + "ļ": 253, + "Ľ": 254, + "ľ": 255, + "Ŀ": 256, + "ŀ": 257, + "Ł": 258, + "ł": 259, + "Ń": 260, + "an": 261, + "Ġd": 262, + "er": 263, + "en": 264, + "ar": 265, + "Ġm": 266, + "la": 267, + "ang": 268, + "Ġs": 269, + "Ġp": 270, + "in": 271, + "at": 272, + "Ġk": 273, + "Ġt": 274, + "Ġb": 275, + "da": 276, + "Ġdi": 277, + "un": 278, + "as": 279, + "kan": 280, + "em": 281, + "ah": 282, + "al": 283, + "ya": 284, + "am": 285, + "Ġse": 286, + "ada": 287, + "Ġmen": 288, + "si": 289, + "yang": 290, + "Ġdan": 291, + "Ġyang": 292, + "tu": 293, + "on": 294, + "ga": 295, + "ĠS": 296, + "ak": 297, + "ari": 298, + "lah": 299, + "il": 300, + "es": 301, + "or": 302, + "di": 303, + "Ġke": 304, + "is": 305, + "Ġber": 306, + "ik": 307, + "ĠP": 308, + "eng": 309, + "ĠA": 310, + "bu": 311, + "Ġter": 312, + "us": 313, + "ta": 314, + "ol": 315, + "ing": 316, + "el": 317, + "um": 318, + "ĠK": 319, + "ur": 320, + "Ġin": 321, + "ĠM": 322, + "Ġdari": 323, + "ara": 324, + "ba": 325, + "ti": 326, + "nya": 327, + "lam": 328, + "ap": 329, + "ĠB": 330, + "Ġper": 331, + "ek": 332, + "Ġini": 333, + "akan": 334, + "Ġmem": 335, + "it": 336, + "Ġ1": 337, + "uk": 338, + "ĠI": 339, + "ja": 340, + "ĠD": 341, + "Ġ\"": 342, + "ia": 343, + "ĠT": 344, + "Ġada": 345, + "engan": 346, + "Ġpada": 347, + "Ġj": 348, + "up": 349, + "gi": 350, + "tuk": 351, + "Ġdengan": 352, + "Ġh": 353, + "im": 354, + "ul": 355, + "ir": 356, + "alam": 357, + "om": 358, + "Ġ(": 359, + "et": 360, + "asi": 361, + "ut": 362, + "ung": 363, + "ama": 364, + "Ġadalah": 365, + "Ġun": 366, + "Ġmer": 367, + "Ġ2": 368, + "Ġl": 369, + "Ġmeng": 370, + "Ġuntuk": 371, + "ro": 372, + "eb": 373, + "Ġdalam": 374, + "ri": 375, + "esi": 376, + "anya": 377, + "eh": 378, + "ĠC": 379, + "ĠJ": 380, + "Ġpen": 381, + "ter": 382, + "ahun": 383, + "ĠR": 384, + "Ġr": 385, + "gai": 386, + "se": 387, + "ela": 388, + "Ġn": 389, + "ata": 390, + "ĠN": 391, + "Ġf": 392, + "Ġtahun": 393, + "Ġla": 394, + "ĠL": 395, + "oleh": 396, + "wa": 397, + "bagai": 398, + "Ġba": 399, + "Ġg": 400, + "id": 401, + "per": 402, + "Ġoleh": 403, + "Ġa": 404, + "pat": 405, + "ĠH": 406, + "Ġ20": 407, + "Ġju": 408, + "ĠIn": 409, + "emb": 410, + "Ġ19": 411, + "