# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from typing import List from typing import Tuple import jieba_fast as jieba from pypinyin import lazy_pinyin from pypinyin import Style class ToneSandhi: def __init__(self): self.must_neural_tone_words = { "麻烦", "麻利", "鸳鸯", "高粱", "骨头", "骆驼", "马虎", "首饰", "馒头", "馄饨", "风筝", "难为", "队伍", "阔气", "闺女", "门道", "锄头", "铺盖", "铃铛", "铁匠", "钥匙", "里脊", "里头", "部分", "那么", "道士", "造化", "迷糊", "连累", "这么", "这个", "运气", "过去", "软和", "转悠", "踏实", "跳蚤", "跟头", "趔趄", "财主", "豆腐", "讲究", "记性", "记号", "认识", "规矩", "见识", "裁缝", "补丁", "衣裳", "衣服", "衙门", "街坊", "行李", "行当", "蛤蟆", "蘑菇", "薄荷", "葫芦", "葡萄", "萝卜", "荸荠", "苗条", "苗头", "苍蝇", "芝麻", "舒服", "舒坦", "舌头", "自在", "膏药", "脾气", "脑袋", "脊梁", "能耐", "胳膊", "胭脂", "胡萝", "胡琴", "胡同", "聪明", "耽误", "耽搁", "耷拉", "耳朵", "老爷", "老实", "老婆", "老头", "老太", "翻腾", "罗嗦", "罐头", "编辑", "结实", "红火", "累赘", "糨糊", "糊涂", "精神", "粮食", "簸箕", "篱笆", "算计", "算盘", "答应", "笤帚", "笑语", "笑话", "窟窿", "窝囊", "窗户", "稳当", "稀罕", "称呼", "秧歌", "秀气", "秀才", "福气", "祖宗", "砚台", "码头", "石榴", "石头", "石匠", "知识", "眼睛", "眯缝", "眨巴", "眉毛", "相声", "盘算", "白净", "痢疾", "痛快", "疟疾", "疙瘩", "疏忽", "畜生", "生意", "甘蔗", "琵琶", "琢磨", "琉璃", "玻璃", "玫瑰", "玄乎", "狐狸", "状元", "特务", "牲口", "牙碜", "牌楼", "爽快", "爱人", "热闹", "烧饼", "烟筒", "烂糊", "点心", "炊帚", "灯笼", "火候", "漂亮", "滑溜", "溜达", "温和", "清楚", "消息", "浪头", "活泼", "比方", "正经", "欺负", "模糊", "槟榔", "棺材", "棒槌", "棉花", "核桃", "栅栏", "柴火", "架势", "枕头", "枇杷", "机灵", "本事", "木头", "木匠", "朋友", "月饼", "月亮", "暖和", "明白", "时候", "新鲜", "故事", "收拾", "收成", "提防", "挖苦", "挑剔", "指甲", "指头", "拾掇", "拳头", "拨弄", "招牌", "招呼", "抬举", "护士", "折腾", "扫帚", "打量", "打算", "打点", "打扮", "打听", "打发", "扎实", "扁担", "戒指", "懒得", "意识", "意思", "情形", "悟性", "怪物", "思量", "怎么", "念头", "念叨", "快活", "忙活", "志气", "心思", "得罪", "张罗", "弟兄", "开通", "应酬", "庄稼", "干事", "帮手", "帐篷", "希罕", "师父", "师傅", "巴结", "巴掌", "差事", "工夫", "岁数", "屁股", "尾巴", "少爷", "小气", "小伙", "将就", "对头", "对付", "寡妇", "家伙", "客气", "实在", "官司", "学问", "学生", "字号", "嫁妆", "媳妇", "媒人", "婆家", "娘家", "委屈", "姑娘", "姐夫", "妯娌", "妥当", "妖精", "奴才", "女婿", "头发", "太阳", "大爷", "大方", "大意", "大夫", "多少", "多么", "外甥", "壮实", "地道", "地方", "在乎", "困难", "嘴巴", "嘱咐", "嘟囔", "嘀咕", "喜欢", "喇嘛", "喇叭", "商量", "唾沫", "哑巴", "哈欠", "哆嗦", "咳嗽", "和尚", "告诉", "告示", "含糊", "吓唬", "后头", "名字", "名堂", "合同", "吆喝", "叫唤", "口袋", "厚道", "厉害", "千斤", "包袱", "包涵", "匀称", "勤快", "动静", "动弹", "功夫", "力气", "前头", "刺猬", "刺激", "别扭", "利落", "利索", "利害", "分析", "出息", "凑合", "凉快", "冷战", "冤枉", "冒失", "养活", "关系", "先生", "兄弟", "便宜", "使唤", "佩服", "作坊", "体面", "位置", "似的", "伙计", "休息", "什么", "人家", "亲戚", "亲家", "交情", "云彩", "事情", "买卖", "主意", "丫头", "丧气", "两口", "东西", "东家", "世故", "不由", "不在", "下水", "下巴", "上头", "上司", "丈夫", "丈人", "一辈", "那个", "菩萨", "父亲", "母亲", "咕噜", "邋遢", "费用", "冤家", "甜头", "介绍", "荒唐", "大人", "泥鳅", "幸福", "熟悉", "计划", "扑腾", "蜡烛", "姥爷", "照顾", "喉咙", "吉他", "弄堂", "蚂蚱", "凤凰", "拖沓", "寒碜", "糟蹋", "倒腾", "报复", "逻辑", "盘缠", "喽啰", "牢骚", "咖喱", "扫把", "惦记", } self.must_not_neural_tone_words = { "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子", "人人", "虎虎", "幺幺", "干嘛", "学子", "哈哈", "数数", "袅袅", "局地", "以下", "娃哈哈", "花花草草", "留得", "耕地", "想想", "熙熙", "攘攘", "卵子", "死死", "冉冉", "恳恳", "佼佼", "吵吵", "打打", "考考", "整整", "莘莘", "落地", "算子", "家家户户", "青青", } self.punc = ":,;。?!“”‘’':,;.?!" # the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041 # e.g. # word: "家里" # pos: "s" # finals: ['ia1', 'i3'] def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]: # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺 for j, item in enumerate(word): if ( j - 1 >= 0 and item == word[j - 1] and pos[0] in {"n", "v", "a"} and word not in self.must_not_neural_tone_words ): finals[j] = finals[j][:-1] + "5" ge_idx = word.find("个") if len(word) >= 1 and word[-1] in "吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶": finals[-1] = finals[-1][:-1] + "5" elif len(word) >= 1 and word[-1] in "的地得": finals[-1] = finals[-1][:-1] + "5" # e.g. 走了, 看着, 去过 elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}: finals[-1] = finals[-1][:-1] + "5" elif ( len(word) > 1 and word[-1] in "们子" and pos in {"r", "n"} and word not in self.must_not_neural_tone_words ): finals[-1] = finals[-1][:-1] + "5" # e.g. 桌上, 地下, 家里 elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}: finals[-1] = finals[-1][:-1] + "5" # e.g. 上来, 下去 elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开": finals[-1] = finals[-1][:-1] + "5" # 个做量词 elif ( ge_idx >= 1 and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是") ) or word == "个": finals[ge_idx] = finals[ge_idx][:-1] + "5" else: if ( word in self.must_neural_tone_words or word[-2:] in self.must_neural_tone_words ): finals[-1] = finals[-1][:-1] + "5" word_list = self._split_word(word) finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]] for i, word in enumerate(word_list): # conventional neural in Chinese if ( word in self.must_neural_tone_words or word[-2:] in self.must_neural_tone_words ): finals_list[i][-1] = finals_list[i][-1][:-1] + "5" finals = sum(finals_list, []) return finals def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]: # e.g. 看不懂 if len(word) == 3 and word[1] == "不": finals[1] = finals[1][:-1] + "5" else: for i, char in enumerate(word): # "不" before tone4 should be bu2, e.g. 不怕 if char == "不" and i + 1 < len(word) and finals[i + 1][-1] == "4": finals[i] = finals[i][:-1] + "2" return finals def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]: # "一" in number sequences, e.g. 一零零, 二一零 if word.find("一") != -1 and all( [item.isnumeric() for item in word if item != "一"] ): return finals # "一" between reduplication words shold be yi5, e.g. 看一看 elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]: finals[1] = finals[1][:-1] + "5" # when "一" is ordinal word, it should be yi1 elif word.startswith("第一"): finals[1] = finals[1][:-1] + "1" else: for i, char in enumerate(word): if char == "一" and i + 1 < len(word): # "一" before tone4 should be yi2, e.g. 一段 if finals[i + 1][-1] == "4": finals[i] = finals[i][:-1] + "2" # "一" before non-tone4 should be yi4, e.g. 一天 else: # "一" 后面如果是标点,还读一声 if word[i + 1] not in self.punc: finals[i] = finals[i][:-1] + "4" return finals def _split_word(self, word: str) -> List[str]: word_list = jieba.cut_for_search(word) word_list = sorted(word_list, key=lambda i: len(i), reverse=False) first_subword = word_list[0] first_begin_idx = word.find(first_subword) if first_begin_idx == 0: second_subword = word[len(first_subword) :] new_word_list = [first_subword, second_subword] else: second_subword = word[: -len(first_subword)] new_word_list = [second_subword, first_subword] return new_word_list def _three_sandhi(self, word: str, finals: List[str]) -> List[str]: if len(word) == 2 and self._all_tone_three(finals): finals[0] = finals[0][:-1] + "2" elif len(word) == 3: word_list = self._split_word(word) if self._all_tone_three(finals): # disyllabic + monosyllabic, e.g. 蒙古/包 if len(word_list[0]) == 2: finals[0] = finals[0][:-1] + "2" finals[1] = finals[1][:-1] + "2" # monosyllabic + disyllabic, e.g. 纸/老虎 elif len(word_list[0]) == 1: finals[1] = finals[1][:-1] + "2" else: finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]] if len(finals_list) == 2: for i, sub in enumerate(finals_list): # e.g. 所有/人 if self._all_tone_three(sub) and len(sub) == 2: finals_list[i][0] = finals_list[i][0][:-1] + "2" # e.g. 好/喜欢 elif ( i == 1 and not self._all_tone_three(sub) and finals_list[i][0][-1] == "3" and finals_list[0][-1][-1] == "3" ): finals_list[0][-1] = finals_list[0][-1][:-1] + "2" finals = sum(finals_list, []) # split idiom into two words who's length is 2 elif len(word) == 4: finals_list = [finals[:2], finals[2:]] finals = [] for sub in finals_list: if self._all_tone_three(sub): sub[0] = sub[0][:-1] + "2" finals += sub return finals def _all_tone_three(self, finals: List[str]) -> bool: return all(x[-1] == "3" for x in finals) # merge "不" and the word behind it # if don't merge, "不" sometimes appears alone according to jieba, which may occur sandhi error def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: new_seg = [] last_word = "" for word, pos in seg: if last_word == "不": word = last_word + word if word != "不": new_seg.append((word, pos)) last_word = word[:] if last_word == "不": new_seg.append((last_word, "d")) last_word = "" return new_seg # function 1: merge "一" and reduplication words in it's left and right, e.g. "听","一","听" ->"听一听" # function 2: merge single "一" and the word behind it # if don't merge, "一" sometimes appears alone according to jieba, which may occur sandhi error # e.g. # input seg: [('听', 'v'), ('一', 'm'), ('听', 'v')] # output seg: [['听一听', 'v']] def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: new_seg = [] # function 1 for i, (word, pos) in enumerate(seg): if ( i - 1 >= 0 and word == "一" and i + 1 < len(seg) and seg[i - 1][0] == seg[i + 1][0] and seg[i - 1][1] == "v" and seg[i + 1][1] == "v" ): new_seg[i - 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0] else: if ( i - 2 >= 0 and seg[i - 1][0] == "一" and seg[i - 2][0] == word and pos == "v" and seg[i - 2][1] == "v" ): continue else: new_seg.append([word, pos]) seg = new_seg new_seg = [] # function 2 for i, (word, pos) in enumerate(seg): if new_seg and new_seg[-1][0] == "一": new_seg[-1][0] = new_seg[-1][0] + word else: new_seg.append([word, pos]) return new_seg # the first and the second words are all_tone_three def _merge_continuous_three_tones( self, seg: List[Tuple[str, str]] ) -> List[Tuple[str, str]]: new_seg = [] sub_finals_list = [ lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg ] assert len(sub_finals_list) == len(seg) merge_last = [False] * len(seg) for i, (word, pos) in enumerate(seg): if ( i - 1 >= 0 and self._all_tone_three(sub_finals_list[i - 1]) and self._all_tone_three(sub_finals_list[i]) and not merge_last[i - 1] ): # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi if ( not self._is_reduplication(seg[i - 1][0]) and len(seg[i - 1][0]) + len(seg[i][0]) <= 3 ): new_seg[-1][0] = new_seg[-1][0] + seg[i][0] merge_last[i] = True else: new_seg.append([word, pos]) else: new_seg.append([word, pos]) return new_seg def _is_reduplication(self, word: str) -> bool: return len(word) == 2 and word[0] == word[1] # the last char of first word and the first char of second word is tone_three def _merge_continuous_three_tones_2( self, seg: List[Tuple[str, str]] ) -> List[Tuple[str, str]]: new_seg = [] sub_finals_list = [ lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg ] assert len(sub_finals_list) == len(seg) merge_last = [False] * len(seg) for i, (word, pos) in enumerate(seg): if ( i - 1 >= 0 and sub_finals_list[i - 1][-1][-1] == "3" and sub_finals_list[i][0][-1] == "3" and not merge_last[i - 1] ): # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi if ( not self._is_reduplication(seg[i - 1][0]) and len(seg[i - 1][0]) + len(seg[i][0]) <= 3 ): new_seg[-1][0] = new_seg[-1][0] + seg[i][0] merge_last[i] = True else: new_seg.append([word, pos]) else: new_seg.append([word, pos]) return new_seg def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: new_seg = [] for i, (word, pos) in enumerate(seg): if i - 1 >= 0 and word == "儿" and seg[i - 1][0] != "#": new_seg[-1][0] = new_seg[-1][0] + seg[i][0] else: new_seg.append([word, pos]) return new_seg def _merge_reduplication(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: new_seg = [] for i, (word, pos) in enumerate(seg): if new_seg and word == new_seg[-1][0]: new_seg[-1][0] = new_seg[-1][0] + seg[i][0] else: new_seg.append([word, pos]) return new_seg def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: seg = self._merge_bu(seg) try: seg = self._merge_yi(seg) except: print("_merge_yi failed") seg = self._merge_reduplication(seg) try: seg = self._merge_continuous_three_tones(seg) except: print("_merge_continuous_three_tones failed") try: seg = self._merge_continuous_three_tones_2(seg) except: print("_merge_continuous_three_tones_2 failed") seg = self._merge_er(seg) return seg def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]: finals = self._bu_sandhi(word, finals) finals = self._yi_sandhi(word, finals) finals = self._neural_sandhi(word, pos, finals) finals = self._three_sandhi(word, finals) return finals