#
# Japanese character category map
#
# $Id: char.def 9 2012-12-12 04:13:15Z togiso $;
#
###################################################################################
#
# CHARACTER CATEGORY DEFINITION
#
# CATEGORY_NAME INVOKE GROUP LENGTH
#
# - CATEGORY_NAME: Name of category. You have to define the DEFAULT class.
# - INVOKE: 1/0: always invoke unknown word processing, even when the word can be found in the lexicon
# - GROUP: 1/0: make a new word by grouping the same character category
# - LENGTH: n: 1 to n length new words are added
#
# Category table. Columns: CATEGORY_NAME INVOKE GROUP LENGTH
DEFAULT      0 1 0 # DEFAULT is a mandatory category!
SPACE        0 1 0
KANJI        0 0 2
SYMBOL       1 1 0
NUMERIC      1 1 0
ALPHA        1 1 0
HIRAGANA     0 1 2
KATAKANA     1 1 2
KANJINUMERIC 0 1 0 #change INVOKE 1->0
GREEK        1 1 0
CYRILLIC     1 1 0
###################################################################################
#
# CODE(UCS2) TO CATEGORY MAPPING
#
# Each entry is a single code point or an inclusive LOW..HIGH range,
# followed by one or more category names.
#
# SPACE
0x0020 SPACE # DO NOT REMOVE THIS LINE, 0x0020 is reserved for SPACE
0x000D SPACE # CR (previously mistyped as 0x00D0, the Latin-1 letter Eth)
0x0009 SPACE # TAB
0x000B SPACE # VT
0x000A SPACE # LF
# ASCII
0x0021..0x002F SYMBOL #!"#$%&'()*+,-./
0x0030..0x0039 NUMERIC #0-9
0x003A..0x0040 SYMBOL #:;<=>?@
0x0041..0x005A ALPHA #A-Z
0x005B..0x0060 SYMBOL #[\]^_`
0x0061..0x007A ALPHA #a-z
0x007B..0x007E SYMBOL #{|}~
# Latin
0x00A1..0x00BF SYMBOL # Latin 1 #¡->¿
0x00C0..0x00FF ALPHA # Latin 1 #À->ÿ
0x0100..0x017F ALPHA # Latin Extended A
0x0180..0x0236 ALPHA # Latin Extended B
0x1E00..0x1EF9 ALPHA # Latin Extended Additional
# CYRILLIC
0x0400..0x04F9 CYRILLIC #Ѐ->ӹ
0x0500..0x050F CYRILLIC # Cyrillic supplementary
# GREEK
0x0374..0x03FB GREEK # Greek and Coptic #ʹ->ϻ
# HIRAGANA
0x3041..0x309F HIRAGANA
# KATAKANA
0x30A1..0x30FF KATAKANA
0x31F0..0x31FF KATAKANA # Small KU .. Small RO
# Prolonged sound mark (ー): the variant that also lists HIRAGANA is kept
# below for reference but disabled; the active line maps it to KATAKANA only,
# which is the same result the 0x30A1..0x30FF range above already gives.
# 0x30FC KATAKANA HIRAGANA # ー
0x30FC KATAKANA
# Half KATAKANA
0xFF66..0xFF9D KATAKANA
0xFF9E..0xFF9F KATAKANA # halfwidth voiced / semi-voiced sound marks
# KANJI
# NOTE(review): 0x3005 and 0x3007 below also fall inside the
# 0x3000..0x303F SYMBOL range defined further down this file, and 0x3007 is
# mapped once more at the very end of the table. If later entries take
# precedence (as in MeCab's dictionary compiler -- TODO confirm), these two
# KANJI lines have no effect.
0x2E80..0x2EF3 KANJI # CJK Radicals Supplement
0x2F00..0x2FD5 KANJI # Kangxi Radicals
0x3005 KANJI # 々
0x3007 KANJI # 〇
0x3400..0x4DB5 KANJI # CJK Unified Ideographs Extension A
0x4E00..0x9FA5 KANJI # CJK Unified Ideographs
0xF900..0xFA2D KANJI # CJK Compatibility Ideographs
0xFA30..0xFA6A KANJI # CJK Compatibility Ideographs
# KANJI-NUMERIC (一 二 三 四 五 六 七 八 九 十 百 千 万 億 兆)
0x4E00 KANJINUMERIC KANJI # 一
0x4E8C KANJINUMERIC KANJI # 二
0x4E09 KANJINUMERIC KANJI # 三
0x56DB KANJINUMERIC KANJI # 四
0x4E94 KANJINUMERIC KANJI # 五
0x516D KANJINUMERIC KANJI # 六
0x4E03 KANJINUMERIC KANJI # 七
0x516B KANJINUMERIC KANJI # 八
0x4E5D KANJINUMERIC KANJI # 九
0x5341 KANJINUMERIC KANJI # 十
0x767E KANJINUMERIC KANJI # 百
0x5343 KANJINUMERIC KANJI # 千
0x4E07 KANJINUMERIC KANJI # 万
0x5104 KANJINUMERIC KANJI # 億
0x5146 KANJINUMERIC KANJI # 兆
# ZENKAKU
0xFF10..0xFF19 NUMERIC # fullwidth 0-9
0xFF21..0xFF3A ALPHA # fullwidth A-Z
0xFF41..0xFF5A ALPHA # fullwidth a-z
0xFF01..0xFF0F SYMBOL #!->/
0xFF1A..0xFF20 SYMBOL #:->@
0xFF3B..0xFF40 SYMBOL #[->`
0xFF5B..0xFF65 SYMBOL #{->・
0xFFE0..0xFFEF SYMBOL # HalfWidth and Full width Form
# OTHER SYMBOLS
0x2000..0x206F SYMBOL # General Punctuation
0x2070..0x209F NUMERIC # Superscripts and Subscripts
0x20A0..0x20CF SYMBOL # Currency Symbols
0x20D0..0x20FF SYMBOL # Combining Diacritical Marks for Symbols
0x2100..0x214F SYMBOL # Letterlike Symbols
0x2150..0x218F NUMERIC # Number forms
0x2190..0x21FF SYMBOL # Arrow
0x2200..0x22FF SYMBOL # Mathematical Operators
0x2300..0x23FF SYMBOL # Miscellaneous Technical
0x2460..0x24FF SYMBOL # Enclosed Alphanumerics
0x2501..0x257F SYMBOL # Box Drawing
0x2580..0x259F SYMBOL # Block Elements
0x25A0..0x25FF SYMBOL # Geometric Shapes
0x2600..0x26FE SYMBOL # Miscellaneous Symbols
0x2700..0x27BF SYMBOL # Dingbats
0x27F0..0x27FF SYMBOL # Supplemental Arrows A
0x27C0..0x27EF SYMBOL # Miscellaneous Mathematical Symbols-A
0x2800..0x28FF SYMBOL # Braille Patterns
0x2900..0x297F SYMBOL # Supplemental Arrows B
0x2B00..0x2BFF SYMBOL # Miscellaneous Symbols and Arrows
0x2A00..0x2AFF SYMBOL # Supplemental Mathematical Operators
0x3300..0x33FF SYMBOL # CJK Compatibility
0x3200..0x32FE SYMBOL # Enclosed CJK Letters and Months
0x3000..0x303F SYMBOL # CJK Symbol and Punctuation
0xFE30..0xFE4F SYMBOL # CJK Compatibility Forms
0xFE50..0xFE6B SYMBOL # Small Form Variants
# added 2006/3/13
# 0x3007 (〇) is listed last, after the 0x3000..0x303F range above, with
# SYMBOL first and KANJINUMERIC second.
0x3007 SYMBOL KANJINUMERIC
# END OF TABLE