ewe-tokenizer-v1 / tokenizer.json
worldboss's picture
Upload tokenizer
9f6884d verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<|endoftext|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": true
},
{
"id": 1,
"content": "a",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "E",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "e",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "Ɛ",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 5,
"content": "ɛ",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 6,
"content": "Ə",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 7,
"content": "ə",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 8,
"content": "I",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 9,
"content": "i",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 10,
"content": "O",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 11,
"content": "o",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 12,
"content": "Ɔ",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 13,
"content": "ɔ",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 14,
"content": "U",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 15,
"content": "u",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 16,
"content": "ã",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 17,
"content": "ẽ",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 18,
"content": "ĩ",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 19,
"content": "õ",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 20,
"content": "ũ",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 21,
"content": "B",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 22,
"content": "b",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 23,
"content": "D",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 24,
"content": "d",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 25,
"content": "Đ",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 26,
"content": "ɖ",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 27,
"content": "F",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 28,
"content": "f",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 29,
"content": "Ƒ",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 30,
"content": "ƒ",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 31,
"content": "G",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 32,
"content": "g",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 33,
"content": "Ɣ",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 34,
"content": "ɣ",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 35,
"content": "H",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 36,
"content": "h",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 37,
"content": "K",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 38,
"content": "k",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 39,
"content": "L",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 40,
"content": "l",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 41,
"content": "M",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 42,
"content": "m",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 43,
"content": "N",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 44,
"content": "n",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 45,
"content": "Ŋ",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 46,
"content": "ŋ",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 47,
"content": "P",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 48,
"content": "p",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 49,
"content": "R",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 50,
"content": "r",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 51,
"content": "S",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 52,
"content": "s",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 53,
"content": "T",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 54,
"content": "t",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 55,
"content": "V",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 56,
"content": "v",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 57,
"content": "Ʋ",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 58,
"content": "ʋ",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 59,
"content": "W",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 60,
"content": "w",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 61,
"content": "X",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 62,
"content": "x",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 63,
"content": "Y",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 64,
"content": "y",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 65,
"content": "Z",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 66,
"content": "z",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 67,
"content": "̃",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 68,
"content": "ó",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 69,
"content": "À",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 70,
"content": "é",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 71,
"content": "È",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 72,
"content": "ò",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 73,
"content": "à",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 74,
"content": "í",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true,
"use_regex": true
},
"post_processor": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": false,
"use_regex": true
},
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": "",
"end_of_word_suffix": "",
"fuse_unk": false,
"byte_fallback": false,
"ignore_merges": false,
"vocab": {
"<|endoftext|>": 0,
"a": 1,
"E": 2,
"e": 3,
"Ɛ": 4,
"ɛ": 5,
"Ə": 6,
"ə": 7,
"I": 8,
"i": 9,
"O": 10,
"o": 11,
"Ɔ": 12,
"ɔ": 13,
"U": 14,
"u": 15,
"ã": 16,
"ẽ": 17,
"ĩ": 18,
"õ": 19,
"ũ": 20,
"B": 21,
"b": 22,
"D": 23,
"d": 24,
"Đ": 25,
"ɖ": 26,
"F": 27,
"f": 28,
"Ƒ": 29,
"ƒ": 30,
"G": 31,
"g": 32,
"Ɣ": 33,
"ɣ": 34,
"H": 35,
"h": 36,
"K": 37,
"k": 38,
"L": 39,
"l": 40,
"M": 41,
"m": 42,
"N": 43,
"n": 44,
"Ŋ": 45,
"ŋ": 46,
"P": 47,
"p": 48,
"R": 49,
"r": 50,
"S": 51,
"s": 52,
"T": 53,
"t": 54,
"V": 55,
"v": 56,
"Ʋ": 57,
"ʋ": 58,
"W": 59,
"w": 60,
"X": 61,
"x": 62,
"Y": 63,
"y": 64,
"Z": 65,
"z": 66,
"̃": 67,
"ó": 68,
"À": 69,
"é": 70,
"È": 71,
"ò": 72,
"à": 73,
"í": 74,
"!": 75,
"\"": 76,
"#": 77,
"$": 78,
"%": 79,
"&": 80,
"'": 81,
"(": 82,
")": 83,
"*": 84,
"+": 85,
",": 86,
"-": 87,
".": 88,
"/": 89,
"0": 90,
"1": 91,
"2": 92,
"3": 93,
"4": 94,
"5": 95,
"6": 96,
"7": 97,
"8": 98,
"9": 99,
":": 100,
";": 101,
"<": 102,
"=": 103,
">": 104,
"?": 105,
"@": 106,
"A": 107,
"C": 108,
"J": 109,
"Q": 110,
"[": 111,
"\\": 112,
"]": 113,
"^": 114,
"_": 115,
"`": 116,
"c": 117,
"j": 118,
"q": 119,
"{": 120,
"|": 121,
"}": 122,
"~": 123,
"¡": 124,
"¢": 125,
"£": 126,
"¤": 127,
"¥": 128,
"¦": 129,
"§": 130,
"¨": 131,
"©": 132,
"ª": 133,
"«": 134,
"¬": 135,
"®": 136,
"¯": 137,
"°": 138,
"±": 139,
"²": 140,
"³": 141,
"´": 142,
"µ": 143,
"¶": 144,
"·": 145,
"¸": 146,
"¹": 147,
"º": 148,
"»": 149,
"¼": 150,
"½": 151,
"¾": 152,
"¿": 153,
"Á": 154,
"Â": 155,
"Ã": 156,
"Ä": 157,
"Å": 158,
"Æ": 159,
"Ç": 160,
"É": 161,
"Ê": 162,
"Ë": 163,
"Ì": 164,
"Í": 165,
"Î": 166,
"Ï": 167,
"Ð": 168,
"Ñ": 169,
"Ò": 170,
"Ó": 171,
"Ô": 172,
"Õ": 173,
"Ö": 174,
"×": 175,
"Ø": 176,
"Ù": 177,
"Ú": 178,
"Û": 179,
"Ü": 180,
"Ý": 181,
"Þ": 182,
"ß": 183,
"á": 184,
"â": 185,
"ä": 186,
"å": 187,
"æ": 188,
"ç": 189,
"è": 190,
"ê": 191,
"ë": 192,
"ì": 193,
"î": 194,
"ï": 195,
"ð": 196,
"ñ": 197,
"ô": 198,
"ö": 199,
"÷": 200,
"ø": 201,
"ù": 202,
"ú": 203,
"û": 204,
"ü": 205,
"ý": 206,
"þ": 207,
"ÿ": 208,
"Ā": 209,
"ā": 210,
"Ă": 211,
"ă": 212,
"Ą": 213,
"ą": 214,
"Ć": 215,
"ć": 216,
"Ĉ": 217,
"ĉ": 218,
"Ċ": 219,
"ċ": 220,
"Č": 221,
"č": 222,
"Ď": 223,
"ď": 224,
"đ": 225,
"Ē": 226,
"ē": 227,
"Ĕ": 228,
"ĕ": 229,
"Ė": 230,
"ė": 231,
"Ę": 232,
"ę": 233,
"Ě": 234,
"ě": 235,
"Ĝ": 236,
"ĝ": 237,
"Ğ": 238,
"ğ": 239,
"Ġ": 240,
"ġ": 241,
"Ģ": 242,
"ģ": 243,
"Ĥ": 244,
"ĥ": 245,
"Ħ": 246,
"ħ": 247,
"Ĩ": 248,
"Ī": 249,
"ī": 250,
"Ĭ": 251,
"ĭ": 252,
"Į": 253,
"į": 254,
"İ": 255,
"ı": 256,
"Ĳ": 257,
"ĳ": 258,
"Ĵ": 259,
"ĵ": 260,
"Ķ": 261,
"ķ": 262,
"ĸ": 263,
"Ĺ": 264,
"ĺ": 265,
"Ļ": 266,
"ļ": 267,
"Ľ": 268,
"ľ": 269,
"Ŀ": 270,
"ŀ": 271,
"Ł": 272,
"ł": 273,
"Ń": 274
},
"merges": []
}
}