manga-ocr / tokenizer.json
Dnouv
addd tokenizer
b52acc2
raw
history blame
21.2 kB
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "BertNormalizer",
"clean_text": true,
"handle_chinese_chars": true,
"strip_accents": null,
"lowercase": true
},
"pre_tokenizer": {
"type": "BertPreTokenizer"
},
"post_processor": null,
"decoder": {
"type": "WordPiece",
"prefix": "##",
"cleanup": true
},
"model": {
"type": "WordPiece",
"unk_token": "[UNK]",
"continuing_subword_prefix": "##",
"max_input_chars_per_word": 100,
"vocab": {
"[PAD]": 0,
"[UNK]": 1,
"[CLS]": 2,
"[SEP]": 3,
"[MASK]": 4,
"%": 5,
"+": 6,
"0": 7,
"1": 8,
"2": 9,
"3": 10,
"4": 11,
"5": 12,
"6": 13,
"7": 14,
"8": 15,
"9": 16,
"<": 17,
"=": 18,
">": 19,
"?": 20,
"[": 21,
"]": 22,
"a": 23,
"b": 24,
"c": 25,
"d": 26,
"e": 27,
"f": 28,
"g": 29,
"h": 30,
"i": 31,
"j": 32,
"k": 33,
"l": 34,
"m": 35,
"n": 36,
"o": 37,
"p": 38,
"q": 39,
"r": 40,
"s": 41,
"t": 42,
"u": 43,
"v": 44,
"w": 45,
"x": 46,
"y": 47,
"z": 48,
"·": 49,
"¿": 50,
"æ": 51,
"ø": 52,
"þ": 53,
"đ": 54,
"ı": 55,
"ł": 56,
"ɨ": 57,
"ɹ": 58,
"ʁ": 59,
"ʌ": 60,
"ʿ": 61,
"α": 62,
"β": 63,
"γ": 64,
"δ": 65,
"ε": 66,
"ζ": 67,
"η": 68,
"θ": 69,
"ι": 70,
"κ": 71,
"λ": 72,
"μ": 73,
"ν": 74,
"ξ": 75,
"ο": 76,
"π": 77,
"ρ": 78,
"σ": 79,
"τ": 80,
"υ": 81,
"φ": 82,
"χ": 83,
"ψ": 84,
"ω": 85,
"а": 86,
"б": 87,
"в": 88,
"г": 89,
"д": 90,
"е": 91,
"ж": 92,
"з": 93,
"и": 94,
"к": 95,
"л": 96,
"м": 97,
"н": 98,
"о": 99,
"п": 100,
"р": 101,
"с": 102,
"т": 103,
"у": 104,
"ф": 105,
"х": 106,
"ц": 107,
"ч": 108,
"ш": 109,
"э": 110,
"ю": 111,
"я": 112,
"і": 113,
"ј": 114,
"ա": 115,
"գ": 116,
"յ": 117,
"א": 118,
"ט": 119,
"ם": 120,
"ש": 121,
"ا": 122,
"ث": 123,
"د": 124,
"س": 125,
"ش": 126,
"ي": 127,
"ण": 128,
"प": 129,
"ा": 130,
"া": 131,
"ய": 132,
"ප": 133,
"ර": 134,
"ව": 135,
"ศ": 136,
"ນ": 137,
"ເ": 138,
"་": 139,
"བ": 140,
"გ": 141,
"თ": 142,
"ქ": 143,
"წ": 144,
"ᄀ": 145,
"ᄂ": 146,
"ᄃ": 147,
"ᄅ": 148,
"ᄆ": 149,
"ᄇ": 150,
"ᄉ": 151,
"ᄋ": 152,
"ᄌ": 153,
"ᄎ": 154,
"ᄏ": 155,
"ᄐ": 156,
"ᄑ": 157,
"ᄒ": 158,
"ᅡ": 159,
"ᅢ": 160,
"ᅣ": 161,
"ᅥ": 162,
"ᅦ": 163,
"ᅧ": 164,
"ᅩ": 165,
"ᅪ": 166,
"ᅭ": 167,
"ᅮ": 168,
"ᅯ": 169,
"ᅱ": 170,
"ᅲ": 171,
"ᅳ": 172,
"ᅴ": 173,
"ᅵ": 174,
"ᆨ": 175,
"ᆫ": 176,
"ᆯ": 177,
"ᆷ": 178,
"ᆸ": 179,
"ᆼ": 180,
"ង": 181,
"ᠠ": 182,
"ᠢ": 183,
"‐": 184,
"„": 185,
"‡": 186,
"‧": 187,
"↑": 188,
"∂": 189,
"∞": 190,
"≦": 191,
"≪": 192,
"⊂": 193,
"⋅": 194,
"⋯": 195,
"┏": 196,
"◆": 197,
"◇": 198,
"♀": 199,
"♭": 200,
"。": 201,
"〕": 202,
"ぇ": 203,
"か": 204,
"き": 205,
"く": 206,
"け": 207,
"こ": 208,
"さ": 209,
"し": 210,
"す": 211,
"せ": 212,
"そ": 213,
"た": 214,
"ち": 215,
"つ": 216,
"て": 217,
"と": 218,
"は": 219,
"ひ": 220,
"ふ": 221,
"へ": 222,
"ほ": 223,
"や": 224,
"ゅ": 225,
"ゝ": 226,
"ウ": 227,
"ェ": 228,
"カ": 229,
"キ": 230,
"ク": 231,
"ケ": 232,
"コ": 233,
"サ": 234,
"シ": 235,
"ス": 236,
"セ": 237,
"ソ": 238,
"タ": 239,
"チ": 240,
"ツ": 241,
"テ": 242,
"ト": 243,
"ハ": 244,
"ヒ": 245,
"フ": 246,
"ヘ": 247,
"ホ": 248,
"メ": 249,
"ラ": 250,
"ヱ": 251,
"ヵ": 252,
"・": 253,
"万": 254,
"丘": 255,
"丹": 256,
"主": 257,
"久": 258,
"乐": 259,
"乖": 260,
"了": 261,
"亢": 262,
"享": 263,
"亮": 264,
"今": 265,
"仮": 266,
"件": 267,
"伐": 268,
"住": 269,
"佑": 270,
"佟": 271,
"佳": 272,
"侮": 273,
"侯": 274,
"侵": 275,
"俘": 276,
"俟": 277,
"俣": 278,
"俸": 279,
"個": 280,
"倍": 281,
"候": 282,
"倩": 283,
"倶": 284,
"偕": 285,
"偵": 286,
"傍": 287,
"傑": 288,
"傳": 289,
"債": 290,
"僅": 291,
"儂": 292,
"償": 293,
"兌": 294,
"兪": 295,
"共": 296,
"内": 297,
"写": 298,
"冠": 299,
"冲": 300,
"冷": 301,
"准": 302,
"出": 303,
"刁": 304,
"到": 305,
"刻": 306,
"剛": 307,
"剽": 308,
"劔": 309,
"勃": 310,
"勉": 311,
"勤": 312,
"匕": 313,
"區": 314,
"協": 315,
"博": 316,
"卡": 317,
"卦": 318,
"厄": 319,
"厙": 320,
"原": 321,
"厠": 322,
"又": 323,
"双": 324,
"受": 325,
"叛": 326,
"只": 327,
"合": 328,
"吋": 329,
"含": 330,
"吹": 331,
"呑": 332,
"呟": 333,
"和": 334,
"咥": 335,
"咲": 336,
"品": 337,
"哈": 338,
"哲": 339,
"唯": 340,
"唱": 341,
"啄": 342,
"喉": 343,
"喋": 344,
"喩": 345,
"喪": 346,
"喫": 347,
"嘲": 348,
"嘴": 349,
"噌": 350,
"噛": 351,
"嚢": 352,
"嚴": 353,
"囂": 354,
"因": 355,
"囿": 356,
"圏": 357,
"圖": 358,
"圳": 359,
"圻": 360,
"坊": 361,
"坑": 362,
"垠": 363,
"垣": 364,
"埃": 365,
"城": 366,
"堕": 367,
"塩": 368,
"墓": 369,
"壌": 370,
"壕": 371,
"壮": 372,
"大": 373,
"奉": 374,
"奔": 375,
"奮": 376,
"奸": 377,
"妍": 378,
"妓": 379,
"姓": 380,
"姶": 381,
"娟": 382,
"婦": 383,
"媚": 384,
"嫉": 385,
"嫡": 386,
"嬪": 387,
"存": 388,
"孟": 389,
"学": 390,
"孫": 391,
"安": 392,
"宜": 393,
"宥": 394,
"宸": 395,
"容": 396,
"寛": 397,
"对": 398,
"寺": 399,
"寿": 400,
"對": 401,
"展": 402,
"岔": 403,
"峯": 404,
"崋": 405,
"崑": 406,
"崔": 407,
"崙": 408,
"嵊": 409,
"嵯": 410,
"嶝": 411,
"嶼": 412,
"嶽": 413,
"川": 414,
"巣": 415,
"帰": 416,
"幅": 417,
"平": 418,
"幹": 419,
"庄": 420,
"度": 421,
"廣": 422,
"廩": 423,
"延": 424,
"弈": 425,
"弉": 426,
"张": 427,
"弦": 428,
"彅": 429,
"彌": 430,
"形": 431,
"彤": 432,
"徨": 433,
"循": 434,
"徽": 435,
"忘": 436,
"忻": 437,
"怠": 438,
"恐": 439,
"恬": 440,
"恵": 441,
"悉": 442,
"悝": 443,
"惚": 444,
"惟": 445,
"惣": 446,
"愛": 447,
"慇": 448,
"慈": 449,
"態": 450,
"憎": 451,
"憑": 452,
"憔": 453,
"憤": 454,
"憺": 455,
"懇": 456,
"懋": 457,
"懲": 458,
"懼": 459,
"戊": 460,
"戎": 461,
"我": 462,
"戛": 463,
"截": 464,
"戯": 465,
"才": 466,
"扎": 467,
"抵": 468,
"拘": 469,
"招": 470,
"拠": 471,
"按": 472,
"挟": 473,
"振": 474,
"捏": 475,
"捗": 476,
"捜": 477,
"捻": 478,
"掘": 479,
"掣": 480,
"掲": 481,
"掾": 482,
"揃": 483,
"揖": 484,
"援": 485,
"揺": 486,
"搭": 487,
"摩": 488,
"摸": 489,
"攘": 490,
"攪": 491,
"救": 492,
"整": 493,
"斉": 494,
"斑": 495,
"斗": 496,
"斬": 497,
"斯": 498,
"旁": 499,
"旋": 500,
"族": 501,
"昆": 502,
"易": 503,
"昕": 504,
"星": 505,
"春": 506,
"晁": 507,
"晰": 508,
"晴": 509,
"暇": 510,
"暗": 511,
"暦": 512,
"暲": 513,
"暴": 514,
"曙": 515,
"曜": 516,
"曝": 517,
"朋": 518,
"朗": 519,
"朦": 520,
"朧": 521,
"朶": 522,
"机": 523,
"杞": 524,
"杰": 525,
"杲": 526,
"枚": 527,
"柘": 528,
"柳": 529,
"栓": 530,
"栖": 531,
"栩": 532,
"桧": 533,
"桿": 534,
"梗": 535,
"梵": 536,
"梶": 537,
"棒": 538,
"棲": 539,
"植": 540,
"楕": 541,
"楮": 542,
"極": 543,
"槌": 544,
"樂": 545,
"樊": 546,
"樒": 547,
"樟": 548,
"横": 549,
"樫": 550,
"樵": 551,
"檀": 552,
"檬": 553,
"檮": 554,
"檸": 555,
"櫛": 556,
"欲": 557,
"欽": 558,
"歆": 559,
"歳": 560,
"歸": 561,
"残": 562,
"殲": 563,
"殴": 564,
"氏": 565,
"汐": 566,
"汝": 567,
"池": 568,
"沃": 569,
"没": 570,
"沱": 571,
"河": 572,
"油": 573,
"沾": 574,
"況": 575,
"泊": 576,
"泗": 577,
"泣": 578,
"注": 579,
"洋": 580,
"洒": 581,
"津": 582,
"洪": 583,
"浄": 584,
"浦": 585,
"海": 586,
"涙": 587,
"涿": 588,
"淀": 589,
"淳": 590,
"渉": 591,
"渕": 592,
"渝": 593,
"測": 594,
"渭": 595,
"湍": 596,
"溯": 597,
"滄": 598,
"滇": 599,
"滎": 600,
"滕": 601,
"滷": 602,
"滾": 603,
"漉": 604,
"漫": 605,
"漬": 606,
"漸": 607,
"漿": 608,
"潜": 609,
"潴": 610,
"潼": 611,
"澎": 612,
"澪": 613,
"濤": 614,
"濫": 615,
"濵": 616,
"瀛": 617,
"瀞": 618,
"瀾": 619,
"灼": 620,
"炸": 621,
"烽": 622,
"焙": 623,
"熈": 624,
"熕": 625,
"熹": 626,
"燃": 627,
"燎": 628,
"營": 629,
"燵": 630,
"爻": 631,
"犀": 632,
"犁": 633,
"状": 634,
"狂": 635,
"狩": 636,
"狭": 637,
"狼": 638,
"猫": 639,
"玖": 640,
"琥": 641,
"琰": 642,
"琶": 643,
"瑜": 644,
"瑟": 645,
"瑣": 646,
"瑶": 647,
"璧": 648,
"瓌": 649,
"瓔": 650,
"瓠": 651,
"甌": 652,
"甑": 653,
"甕": 654,
"甫": 655,
"甲": 656,
"畚": 657,
"畝": 658,
"畠": 659,
"畢": 660,
"畦": 661,
"當": 662,
"疇": 663,
"疎": 664,
"疑": 665,
"疣": 666,
"疫": 667,
"疱": 668,
"痘": 669,
"痢": 670,
"痰": 671,
"痴": 672,
"瘍": 673,
"白": 674,
"皇": 675,
"皐": 676,
"皝": 677,
"盒": 678,
"目": 679,
"相": 680,
"盾": 681,
"省": 682,
"眷": 683,
"眸": 684,
"睡": 685,
"睿": 686,
"矢": 687,
"砂": 688,
"砺": 689,
"磁": 690,
"磧": 691,
"磯": 692,
"礁": 693,
"礎": 694,
"礪": 695,
"社": 696,
"祓": 697,
"禍": 698,
"禎": 699,
"禧": 700,
"科": 701,
"秒": 702,
"秣": 703,
"秩": 704,
"程": 705,
"稍": 706,
"稙": 707,
"稻": 708,
"穂": 709,
"穢": 710,
"突": 711,
"窒": 712,
"窠": 713,
"窯": 714,
"竈": 715,
"童": 716,
"笙": 717,
"笥": 718,
"笹": 719,
"筒": 720,
"筰": 721,
"筵": 722,
"箋": 723,
"箍": 724,
"箒": 725,
"節": 726,
"篩": 727,
"籌": 728,
"籾": 729,
"粒": 730,
"粕": 731,
"粘": 732,
"粥": 733,
"糧": 734,
"紊": 735,
"紘": 736,
"索": 737,
"紳": 738,
"絡": 739,
"絢": 740,
"絨": 741,
"絶": 742,
"綵": 743,
"線": 744,
"緞": 745,
"縉": 746,
"繚": 747,
"级": 748,
"罫": 749,
"置": 750,
"美": 751,
"羲": 752,
"翟": 753,
"翫": 754,
"翳": 755,
"耿": 756,
"聊": 757,
"聟": 758,
"聲": 759,
"聳": 760,
"聾": 761,
"肱": 762,
"肺": 763,
"胚": 764,
"胤": 765,
"胴": 766,
"能": 767,
"脅": 768,
"脈": 769,
"脱": 770,
"脹": 771,
"腐": 772,
"腑": 773,
"膨": 774,
"臻": 775,
"臼": 776,
"舁": 777,
"舌": 778,
"舛": 779,
"舶": 780,
"艙": 781,
"艮": 782,
"芋": 783,
"芍": 784,
"芝": 785,
"芥": 786,
"苔": 787,
"苗": 788,
"苴": 789,
"茄": 790,
"茫": 791,
"茸": 792,
"荏": 793,
"荘": 794,
"莇": 795,
"菊": 796,
"萇": 797,
"萍": 798,
"落": 799,
"葆": 800,
"葛": 801,
"葦": 802,
"蒯": 803,
"蒴": 804,
"蔓": 805,
"蔣": 806,
"蔵": 807,
"蕪": 808,
"薇": 809,
"薙": 810,
"藻": 811,
"蘄": 812,
"蘊": 813,
"蘋": 814,
"蘭": 815,
"蘿": 816,
"虔": 817,
"蛛": 818,
"蛟": 819,
"蜘": 820,
"蜴": 821,
"蜷": 822,
"蝙": 823,
"蝶": 824,
"蝸": 825,
"螂": 826,
"螺": 827,
"蟇": 828,
"衆": 829,
"行": 830,
"衞": 831,
"衢": 832,
"衫": 833,
"衾": 834,
"袢": 835,
"袿": 836,
"要": 837,
"覈": 838,
"視": 839,
"覗": 840,
"訂": 841,
"訃": 842,
"訛": 843,
"訪": 844,
"詛": 845,
"詭": 846,
"詰": 847,
"詵": 848,
"詹": 849,
"誨": 850,
"請": 851,
"論": 852,
"諮": 853,
"諱": 854,
"諶": 855,
"謔": 856,
"謡": 857,
"謨": 858,
"譙": 859,
"讐": 860,
"豆": 861,
"豊": 862,
"豬": 863,
"豳": 864,
"豹": 865,
"貘": 866,
"貪": 867,
"貴": 868,
"貸": 869,
"賑": 870,
"賛": 871,
"賠": 872,
"賤": 873,
"贋": 874,
"贍": 875,
"赛": 876,
"赤": 877,
"趙": 878,
"跛": 879,
"跳": 880,
"踊": 881,
"踞": 882,
"踪": 883,
"蹊": 884,
"蹋": 885,
"躁": 886,
"躍": 887,
"軀": 888,
"軻": 889,
"輓": 890,
"輔": 891,
"輯": 892,
"輳": 893,
"轡": 894,
"迎": 895,
"近": 896,
"述": 897,
"迴": 898,
"逃": 899,
"逍": 900,
"逞": 901,
"連": 902,
"逼": 903,
"適": 904,
"遮": 905,
"遷": 906,
"遽": 907,
"避": 908,
"邀": 909,
"邏": 910,
"郊": 911,
"郯": 912,
"酌": 913,
"酎": 914,
"酛": 915,
"酢": 916,
"酸": 917,
"醇": 918,
"醒": 919,
"釋": 920,
"量": 921,
"釐": 922,
"金": 923,
"釜": 924,
"針": 925,
"鈍": 926,
"鈴": 927,
"鉈": 928,
"鉛": 929,
"鉞": 930,
"銃": 931,
"銕": 932,
"銚": 933,
"鋪": 934,
"錆": 935,
"錠": 936,
"鍮": 937,
"鎬": 938,
"鐔": 939,
"鐸": 940,
"鑚": 941,
"鑢": 942,
"開": 943,
"間": 944,
"閔": 945,
"閨": 946,
"闘": 947,
"陋": 948,
"陟": 949,
"院": 950,
"陳": 951,
"陶": 952,
"隘": 953,
"際": 954,
"隠": 955,
"雁": 956,
"集": 957,
"雌": 958,
"離": 959,
"難": 960,
"需": 961,
"霍": 962,
"霸": 963,
"非": 964,
"靭": 965,
"靳": 966,
"鞘": 967,
"鞬": 968,
"鞭": 969,
"頗": 970,
"頸": 971,
"顆": 972,
"顒": 973,
"顥": 974,
"顧": 975,
"駅": 976,
"駈": 977,
"驍": 978,
"驕": 979,
"驛": 980,
"驟": 981,
"驢": 982,
"髄": 983,
"髢": 984,
"魂": 985,
"魃": 986,
"魍": 987,
"鯨": 988,
"鰐": 989,
"鳳": 990,
"鴈": 991,
"鵄": 992,
"鵠": 993,
"鵲": 994,
"鶚": 995,
"鷺": 996,
"鸕": 997,
"鸚": 998,
"鸞": 999,
"麦": 1000,
"鼠": 1001,
"鼾": 1002,
"齟": 1003,
"齬": 1004,
"##ᅥ": 1005,
"##ᆼ": 1006,
"##ᅱ": 1007,
"##ᆫ": 1008,
"##ᅢ": 1009,
"##ᅮ": 1010,
"##ᆨ": 1011,
"##ᅡ": 1012,
"##ᅩ": 1013,
"##ᅵ": 1014,
"##ᅧ": 1015,
"##ᅴ": 1016,
"##ᅳ": 1017,
"##ᆯ": 1018,
"##ᅪ": 1019,
"##ᅦ": 1020,
"##ᅭ": 1021,
"##ᆷ": 1022,
"##ᅲ": 1023,
"##ᆸ": 1024,
"##n": 1025,
"##u": 1026,
"##s": 1027,
"##e": 1028,
"##d": 1029,
"##1": 1030,
"##l": 1031,
"##a": 1032,
"##k": 1033,
"##6": 1034,
"##5": 1035,
"##3": 1036,
"##ᅯ": 1037,
"##8": 1038,
"##ᅣ": 1039,
"##9": 1040,
"##7": 1041,
"##0": 1042,
"##4": 1043,
"##p": 1044,
"##2": 1045,
"un": 1046,
"##us": 1047,
"##ed": 1048,
"unus": 1049,
"unused": 1050,
"##ᅡᆫ": 1051,
"##ᅡᆼ": 1052,
"##ᅩᆼ": 1053,
"##ᅮᆫ": 1054,
"##ᅥᆫ": 1055,
"##ᅧᆼ": 1056,
"으": 1057,
"하": 1058,
"##ᅵᆫ": 1059,
"##ᅧᆫ": 1060,
"기": 1061,
"그": 1062,
"과": 1063,
"나": 1064,
"서": 1065,
"스": 1066,
"야": 1067,
"화": 1068,
"##ᅧᆨ": 1069,
"거": 1070,
"구": 1071,
"고": 1072,
"도": 1073,
"드": 1074,
"리": 1075,
"무": 1076,
"마": 1077,
"버": 1078,
"부": 1079,
"바": 1080,
"사": 1081,
"시": 1082,
"우": 1083,
"오": 1084,
"이": 1085,
"와": 1086,
"요": 1087,
"유": 1088,
"저": 1089,
"주": 1090,
"지": 1091,
"해": 1092,
"##ᅯᆫ": 1093
}
}
}