French-Tortoise / testing /french_tokenizer.json
Snowad's picture
Rename french_tokenizer.json to testing/french_tokenizer.json
4ec1546
raw
history blame
7.43 kB
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[STOP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[SPACE]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "Whitespace"
},
"post_processor": null,
"decoder": null,
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "[UNK]",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"vocab": {
"[STOP]": 0,
"[UNK]": 1,
"[SPACE]": 2,
"!": 3,
"(": 4,
")": 5,
",": 6,
"-": 7,
".": 8,
"/": 9,
"0": 10,
"1": 11,
"2": 12,
"3": 13,
"4": 14,
"5": 15,
"6": 16,
"7": 17,
"8": 18,
"9": 19,
":": 20,
";": 21,
"?": 22,
"A": 23,
"B": 24,
"C": 25,
"D": 26,
"E": 27,
"F": 28,
"G": 29,
"H": 30,
"I": 31,
"J": 32,
"K": 33,
"L": 34,
"M": 35,
"N": 36,
"O": 37,
"P": 38,
"Q": 39,
"R": 40,
"S": 41,
"T": 42,
"U": 43,
"V": 44,
"W": 45,
"X": 46,
"Y": 47,
"Z": 48,
"a": 49,
"b": 50,
"c": 51,
"d": 52,
"e": 53,
"f": 54,
"g": 55,
"h": 56,
"i": 57,
"j": 58,
"k": 59,
"l": 60,
"m": 61,
"n": 62,
"o": 63,
"p": 64,
"q": 65,
"r": 66,
"s": 67,
"t": 68,
"u": 69,
"v": 70,
"w": 71,
"x": 72,
"y": 73,
"z": 74,
"«": 75,
"»": 76,
"À": 77,
"Ç": 78,
"É": 79,
"Ô": 80,
"à": 81,
"â": 82,
"ç": 83,
"è": 84,
"é": 85,
"ê": 86,
"ë": 87,
"î": 88,
"ï": 89,
"ô": 90,
"ù": 91,
"û": 92,
"œ": 93,
"–": 94,
"’": 95,
"le": 96,
"en": 97,
"ai": 98,
"es": 99,
"ou": 100,
"on": 101,
"de": 102,
"re": 103,
"an": 104,
"qu": 105,
"et": 106,
"er": 107,
"ent": 108,
"il": 109,
"la": 110,
"ait": 111,
"ne": 112,
"it": 113,
"ar": 114,
"au": 115,
"ch": 116,
"is": 117,
"ur": 118,
"se": 119,
"el": 120,
"te": 121,
"in": 122,
"les": 123,
"our": 124,
"av": 125,
"or": 126,
"eu": 127,
"ant": 128,
"ais": 129,
"lu": 130,
"me": 131,
"que": 132,
"pr": 133,
"as": 134,
"om": 135,
"ce": 136,
"oi": 137,
"elle": 138,
"un": 139,
"ét": 140,
"des": 141,
"ous": 142,
"ans": 143,
"tr": 144,
"par": 145,
"si": 146,
"al": 147,
"une": 148,
"du": 149,
"ri": 150,
"est": 151,
"qui": 152,
"son": 153,
"aient": 154,
"pas": 155,
"tre": 156,
"che": 157,
"dans": 158,
"pour": 159,
"sa": 160,
"vi": 161,
"ment": 162,
"ti": 163,
"leur": 164,
"ré": 165,
"con": 166,
"out": 167,
"ère": 168,
"com": 169,
"dit": 170,
"ois": 171,
"ouv": 172,
"ge": 173,
"avait": 174,
"lui": 175,
"était": 176,
"plu": 177,
"ra": 178,
"mais": 179,
"ain": 180,
"eux": 181,
"bi": 182,
"je": 183,
"plus": 184,
"ses": 185,
"ir": 186,
"Il": 187,
"dé": 188,
"ma": 189,
"end": 190,
"mon": 191,
"ils": 192,
"ée": 193,
"and": 194,
"sur": 195,
"li": 196,
"ci": 197,
"ille": 198,
"vous": 199,
"vo": 200,
"bien": 201,
"mp": 202,
"ap": 203,
"ave": 204,
"ser": 205,
"oir": 206,
"tout": 207,
"mi": 208,
"su": 209,
"don": 210,
"rent": 211,
"tu": 212,
"gr": 213,
"ces": 214,
"etit": 215,
"ier": 216,
"ter": 217,
"co": 218,
"petit": 219,
"ab": 220,
"ons": 221,
"jour": 222,
"comme": 223,
"ro": 224,
"res": 225,
"avec": 226,
"eau": 227,
"és": 228,
"ort": 229,
"éc": 230,
"uis": 231,
"Le": 232,
"ut": 233,
"èrent": 234,
"ff": 235,
"lle": 236,
"oin": 237,
"gar": 238,
"gn": 239,
"enf": 240,
"us": 241,
"ors": 242,
"br": 243,
"ès": 244,
"ta": 245,
"quel": 246,
"ette": 247,
"cou": 248,
"mes": 249,
"vie": 250,
"di": 251,
"gu": 252,
"aire": 253,
"per": 254
},
"merges": [
"l e",
"e n",
"a i",
"e s",
"o u",
"o n",
"d e",
"r e",
"a n",
"q u",
"e t",
"e r",
"en t",
"i l",
"l a",
"ai t",
"n e",
"i t",
"a r",
"a u",
"c h",
"i s",
"u r",
"s e",
"e l",
"t e",
"i n",
"le s",
"ou r",
"a v",
"o r",
"e u",
"an t",
"ai s",
"l u",
"m e",
"qu e",
"p r",
"a s",
"o m",
"c e",
"o i",
"el le",
"u n",
"é t",
"d es",
"ou s",
"an s",
"t r",
"p ar",
"s i",
"a l",
"u ne",
"d u",
"r i",
"es t",
"qu i",
"s on",
"ai ent",
"p as",
"t re",
"ch e",
"d ans",
"p our",
"s a",
"v i",
"m ent",
"t i",
"le ur",
"r é",
"c on",
"ou t",
"è re",
"c om",
"d it",
"o is",
"ou v",
"g e",
"av ait",
"lu i",
"ét ait",
"p lu",
"r a",
"m ais",
"ai n",
"eu x",
"b i",
"j e",
"plu s",
"s es",
"i r",
"I l",
"d é",
"m a",
"en d",
"m on",
"il s",
"é e",
"an d",
"s ur",
"l i",
"c i",
"il le",
"v ous",
"v o",
"bi en",
"m p",
"a p",
"av e",
"s er",
"oi r",
"t out",
"m i",
"s u",
"d on",
"r ent",
"t u",
"g r",
"c es",
"et it",
"i er",
"t er",
"c o",
"p etit",
"a b",
"on s",
"j our",
"com me",
"r o",
"r es",
"ave c",
"e au",
"é s",
"or t",
"é c",
"u is",
"L e",
"u t",
"è rent",
"f f",
"l le",
"o in",
"g ar",
"g n",
"en f",
"u s",
"or s",
"b r",
"è s",
"t a",
"qu el",
"et te",
"c ou",
"m es",
"vi e",
"d i",
"g u",
"ai re",
"p er"
]
}
}