urlbert-tiny-base-v1 / tokenizer.json
CrabInHoney's picture
Upload 8 files
fc3e419 verified
raw
history blame
12 kB
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "BertNormalizer",
"clean_text": true,
"handle_chinese_chars": true,
"strip_accents": null,
"lowercase": true
},
"pre_tokenizer": {
"type": "BertPreTokenizer"
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 1
}
}
],
"special_tokens": {
"[CLS]": {
"id": "[CLS]",
"ids": [
2
],
"tokens": [
"[CLS]"
]
},
"[SEP]": {
"id": "[SEP]",
"ids": [
3
],
"tokens": [
"[SEP]"
]
}
}
},
"decoder": {
"type": "WordPiece",
"prefix": "##",
"cleanup": true
},
"model": {
"type": "WordPiece",
"unk_token": "[UNK]",
"continuing_subword_prefix": "##",
"max_input_chars_per_word": 100,
"vocab": {
"[PAD]": 0,
"[UNK]": 1,
"[CLS]": 2,
"[SEP]": 3,
"[MASK]": 4,
"&": 5,
"'": 6,
"*": 7,
",": 8,
"-": 9,
".": 10,
"/": 11,
"0": 12,
"1": 13,
"2": 14,
"3": 15,
"4": 16,
"5": 17,
"6": 18,
"7": 19,
"8": 20,
"9": 21,
":": 22,
";": 23,
"_": 24,
"a": 25,
"b": 26,
"c": 27,
"d": 28,
"e": 29,
"f": 30,
"g": 31,
"h": 32,
"i": 33,
"j": 34,
"k": 35,
"l": 36,
"m": 37,
"n": 38,
"o": 39,
"p": 40,
"q": 41,
"r": 42,
"s": 43,
"t": 44,
"u": 45,
"v": 46,
"w": 47,
"x": 48,
"y": 49,
"z": 50,
"ã": 51,
"ä": 52,
"å": 53,
"æ": 54,
"ç": 55,
"é": 56,
"ë": 57,
"í": 58,
"ï": 59,
"ñ": 60,
"ó": 61,
"ô": 62,
"õ": 63,
"ö": 64,
"ø": 65,
"ú": 66,
"ü": 67,
"ń": 68,
"а": 69,
"г": 70,
"д": 71,
"е": 72,
"и": 73,
"н": 74,
"о": 75,
"р": 76,
"с": 77,
"ф": 78,
"я": 79,
"加": 80,
"大": 81,
"学": 82,
"師": 83,
"祥": 84,
"講": 85,
"贝": 86,
"通": 87,
"##a": 88,
"##t": 89,
"##h": 90,
"##l": 91,
"##i": 92,
"##n": 93,
"##e": 94,
"##d": 95,
"##y": 96,
"##o": 97,
"##u": 98,
"##b": 99,
"##6": 100,
"##s": 101,
"##c": 102,
"##r": 103,
"##p": 104,
"##k": 105,
"##j": 106,
"##g": 107,
"##w": 108,
"##m": 109,
"##z": 110,
"##v": 111,
"##q": 112,
"##f": 113,
"##x": 114,
"##8": 115,
"##1": 116,
"##2": 117,
"##0": 118,
"##7": 119,
"##9": 120,
"##3": 121,
"##4": 122,
"##5": 123,
"##ø": 124,
"##æ": 125,
"##ú": 126,
"##ä": 127,
"##ñ": 128,
"##é": 129,
"##е": 130,
"##ç": 131,
"##ã": 132,
"##õ": 133,
"##í": 134,
"##ф": 135,
"##ë": 136,
"##о": 137,
"##с": 138,
"##и": 139,
"##я": 140,
"##г": 141,
"##д": 142,
"##н": 143,
"##通": 144,
"##贝": 145,
"##祥": 146,
"##ń": 147,
"##ï": 148,
"##学": 149,
"##講": 150,
"##師": 151,
"##ö": 152,
"##ô": 153,
"##å": 154,
"##ó": 155,
"##ü": 156,
"##р": 157,
"##а": 158,
"co": 159,
"com": 160,
"##er": 161,
"##in": 162,
"##ar": 163,
"##ou": 164,
"##es": 165,
"##or": 166,
"##nt": 167,
"##et": 168,
"##al": 169,
"##ro": 170,
"##il": 171,
"##el": 172,
"##ic": 173,
"##ec": 174,
"##at": 175,
"##on": 176,
"##ac": 177,
"##it": 178,
"##is": 179,
"##an": 180,
"##ew": 181,
"##rc": 182,
"pro": 183,
"##ing": 184,
"##out": 185,
"##tp": 186,
"##ts": 187,
"##ap": 188,
"##og": 189,
"re": 190,
"##as": 191,
"##rg": 192,
"##pp": 193,
"org": 194,
"##am": 195,
"##ile": 196,
"##op": 197,
"st": 198,
"##ti": 199,
"##em": 200,
"ma": 201,
"##ol": 202,
"##ri": 203,
"ch": 204,
"##ort": 205,
"##ews": 206,
"##ers": 207,
"##en": 208,
"##ms": 209,
"##ch": 210,
"ab": 211,
"##du": 212,
"about": 213,
"##me": 214,
"##art": 215,
"su": 216,
"##re": 217,
"##nc": 218,
"##net": 219,
"##ad": 220,
"##ent": 221,
"##ourc": 222,
"tel": 223,
"##eam": 224,
"##dap": 225,
"ws": 226,
"se": 227,
"ldap": 228,
"ht": 229,
"do": 230,
"irc": 231,
"http": 232,
"ac": 233,
"##ter": 234,
"sh": 235,
"news": 236,
"bl": 237,
"##pi": 238,
"##erv": 239,
"##ard": 240,
"##ting": 241,
"##bo": 242,
"mail": 243,
"##ervic": 244,
"ad": 245,
"shop": 246,
"uk": 247,
"blog": 248,
"in": 249,
"api": 250,
"##ervices": 251,
"##st": 252,
"##arch": 253,
"##eg": 254,
"##min": 255,
"##act": 256,
"##duc": 257,
"##ash": 258,
"##ount": 259,
"##lo": 260,
"##riv": 261,
"acc": 262,
"br": 263,
"##elp": 264,
"cont": 265,
"ter": 266,
"log": 267,
"##eck": 268,
"reg": 269,
"##acy": 270,
"set": 271,
"##tings": 272,
"##pport": 273,
"prof": 274,
"##ducts": 275,
"##ister": 276,
"search": 277,
"##board": 278,
"settings": 279,
"priv": 280,
"terms": 281,
"services": 282,
"help": 283,
"register": 284,
"support": 285,
"account": 286,
"dash": 287,
"dashboard": 288,
"privacy": 289,
"login": 290,
"##mp": 291,
"contact": 292,
"check": 293,
"admin": 294,
"profile": 295,
"checkout": 296,
"products": 297,
"##io": 298,
"##eb": 299,
"##are": 300,
"##um": 301,
"##po": 302,
"##to": 303,
"##sh": 304,
"##all": 305,
"##vi": 306,
"nl": 307,
"##pe": 308,
"de": 309,
"##her": 310,
"##sy": 311,
"##ies": 312,
"##ial": 313,
"##ric": 314,
"##ob": 315,
"##rou": 316,
"sk": 317,
"sc": 318,
"vi": 319,
"au": 320,
"##cs": 321,
"##ners": 322,
"##ata": 323,
"##dis": 324,
"##roup": 325,
"##vent": 326,
"##ex": 327,
"##artners": 328,
"##ion": 329,
"##ag": 330,
"sky": 331,
"##wn": 332,
"##ource": 333,
"##cal": 334,
"##ery": 335,
"##mon": 336,
"##ed": 337,
"rt": 338,
"##fy": 339,
"for": 340,
"sf": 341,
"web": 342,
"res": 343,
"mag": 344,
"sv": 345,
"skype": 346,
"##co": 347,
"fa": 348,
"rtmp": 349,
"spo": 350,
"##tsp": 351,
"git": 352,
"gop": 353,
"xm": 354,
"data": 355,
"cat": 356,
"##rome": 357,
"xmpp": 358,
"##tify": 359,
"spotify": 360,
"##ntp": 361,
"gopher": 362,
"sftp": 363,
"##ects": 364,
"source": 365,
"ftp": 366,
"port": 367,
"##sync": 368,
"net": 369,
"rtsp": 370,
"view": 371,
"rsync": 372,
"vnc": 373,
"https": 374,
"chrome": 375,
"##vents": 376,
"ssh": 377,
"ldaps": 378,
"mailto": 379,
"wss": 380,
"nntp": 381,
"ircs": 382,
"scp": 383,
"svn": 384,
"telnet": 385,
"mms": 386,
"##ources": 387,
"magnet": 388,
"redis": 389,
"##esti": 390,
"##fol": 391,
"steam": 392,
"webcal": 393,
"file": 394,
"team": 395,
"##ads": 396,
"events": 397,
"##ials": 398,
"care": 399,
"sit": 400,
"##folio": 401,
"##jects": 402,
"portfolio": 403,
"partners": 404,
"##ure": 405,
"revi": 406,
"cart": 407,
"resources": 408,
"forum": 409,
"testi": 410,
"##monials": 411,
"testimonials": 412,
"reviews": 413,
"gall": 414,
"##emap": 415,
"##ories": 416,
"pric": 417,
"##us": 418,
"sitemap": 419,
"projects": 420,
"##and": 421,
"docs": 422,
"gallery": 423,
"faq": 424,
"down": 425,
"pricing": 426,
"##egories": 427,
"categories": 428,
"##loads": 429,
"downloads": 430,
"careers": 431,
"##ig": 432,
"##ore": 433,
"##ia": 434,
"##ur": 435,
"##sc": 436,
"##eta": 437,
"sec": 438,
"##ww": 439,
"##emo": 440,
"cd": 441,
"mob": 442,
"app": 443,
"##atic": 444,
"www": 445,
"dev": 446,
"mobile": 447,
"secure": 448,
"cdn": 449,
"store": 450,
"demo": 451,
"beta": 452,
"##ech": 453,
"static": 454,
"ho": 455,
"##ra": 456,
"##ons": 457,
"##le": 458,
"ca": 459,
"##ul": 460,
"##be": 461,
"it": 462,
"##ov": 463,
"##ir": 464,
"##group": 465,
"edu": 466,
"fr": 467,
"##th": 468,
"##id": 469,
"##os": 470,
"##om": 471,
"ind": 472,
"##ab": 473,
"##un": 474,
"##ot": 475,
"##est": 476,
"##ay": 477,
"##ation": 478,
"home": 479,
"##ru": 480,
"##anc": 481,
"th": 482,
"sub": 483,
"##ribe": 484,
"index": 485,
"##od": 486,
"##scribe": 487,
"subscribe": 488,
"##of": 489,
"al": 490,
"##oc": 491,
"##ity": 492,
"##iv": 493,
"##ine": 494,
"##inc": 495,
"##im": 496,
"ar": 497,
"tr": 498,
"es": 499
}
}
}