Spaces:
Running
on
T4
Update Preprocessing/TextFrontend.py
Browse files
Preprocessing/TextFrontend.py
CHANGED
@@ -4,6 +4,7 @@
|
|
4 |
import json
|
5 |
import logging
|
6 |
import re
|
|
|
7 |
|
8 |
import torch
|
9 |
from dragonmapper.transcriptions import pinyin_to_ipa
|
@@ -848,7 +849,7 @@ class ArticulatoryCombinedTextFrontend:
|
|
848 |
# languages use different tones denoted by different numbering
|
849 |
# systems. At this point in the script, it is attempted to unify
|
850 |
# them all to the tones in the IPA standard.
|
851 |
-
if self.g2p_lang == "vi":
|
852 |
phones = phones.replace('1', "˧")
|
853 |
phones = phones.replace('2', "˨˩")
|
854 |
phones = phones.replace('ɜ', "˧˥") # I'm fairly certain that this is a bug in espeak and ɜ is meant to be 3
|
@@ -1052,11 +1053,49 @@ def english_text_expansion(text):
|
|
1052 |
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in
|
1053 |
[('Mrs.', 'misess'), ('Mr.', 'mister'), ('Dr.', 'doctor'), ('St.', 'saint'), ('Co.', 'company'), ('Jr.', 'junior'), ('Maj.', 'major'),
|
1054 |
('Gen.', 'general'), ('Drs.', 'doctors'), ('Rev.', 'reverend'), ('Lt.', 'lieutenant'), ('Hon.', 'honorable'), ('Sgt.', 'sergeant'),
|
1055 |
-
('Capt.', 'captain'), ('Esq.', 'esquire'), ('Ltd.', 'limited'), ('Col.', 'colonel'), ('Ft.', 'fort')]]
|
1056 |
for regex, replacement in _abbreviations:
|
1057 |
text = re.sub(regex, replacement, text)
|
1058 |
return text
|
1059 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1060 |
|
1061 |
def remove_french_spacing(text):
|
1062 |
text = text.replace(" »", '"').replace("« ", '"')
|
@@ -1066,6 +1105,7 @@ def remove_french_spacing(text):
|
|
1066 |
|
1067 |
|
1068 |
def convert_kanji_to_pinyin_mandarin(text):
|
|
|
1069 |
return " ".join([x[0] for x in pinyin(text)])
|
1070 |
|
1071 |
|
@@ -1074,7 +1114,7 @@ def get_language_id(language):
|
|
1074 |
iso_codes_to_ids = load_json_from_path("Preprocessing/multilinguality/iso_lookup.json")[-1]
|
1075 |
except FileNotFoundError:
|
1076 |
try:
|
1077 |
-
iso_codes_to_ids = load_json_from_path("multilinguality/iso_lookup.json")[-1]
|
1078 |
except FileNotFoundError:
|
1079 |
iso_codes_to_ids = load_json_from_path("iso_lookup.json")[-1]
|
1080 |
if language not in iso_codes_to_ids:
|
@@ -1090,7 +1130,7 @@ if __name__ == '__main__':
|
|
1090 |
|
1091 |
print("\n\nChinese Test")
|
1092 |
tf = ArticulatoryCombinedTextFrontend(language="cmn")
|
1093 |
-
tf.string_to_tensor("
|
1094 |
tf.string_to_tensor("李绅 《悯农》 锄禾日当午, 汗滴禾下土。 谁知盘中餐, 粒粒皆辛苦。", view=True)
|
1095 |
tf.string_to_tensor("巴 拔 把 爸 吧", view=True)
|
1096 |
|
|
|
4 |
import json
|
5 |
import logging
|
6 |
import re
|
7 |
+
from pathlib import Path
|
8 |
|
9 |
import torch
|
10 |
from dragonmapper.transcriptions import pinyin_to_ipa
|
|
|
849 |
# languages use different tones denoted by different numbering
|
850 |
# systems. At this point in the script, it is attempted to unify
|
851 |
# them all to the tones in the IPA standard.
|
852 |
+
if self.g2p_lang == "vi" or self.g2p_lang == "vi-vn-x-central" or self.g2p_lang == "vi-vn-x-south":
|
853 |
phones = phones.replace('1', "˧")
|
854 |
phones = phones.replace('2', "˨˩")
|
855 |
phones = phones.replace('ɜ', "˧˥") # I'm fairly certain that this is a bug in espeak and ɜ is meant to be 3
|
|
|
1053 |
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in
|
1054 |
[('Mrs.', 'misess'), ('Mr.', 'mister'), ('Dr.', 'doctor'), ('St.', 'saint'), ('Co.', 'company'), ('Jr.', 'junior'), ('Maj.', 'major'),
|
1055 |
('Gen.', 'general'), ('Drs.', 'doctors'), ('Rev.', 'reverend'), ('Lt.', 'lieutenant'), ('Hon.', 'honorable'), ('Sgt.', 'sergeant'),
|
1056 |
+
('Capt.', 'captain'), ('Esq.', 'esquire'), ('Ltd.', 'limited'), ('Col.', 'colonel'), ('Ft.', 'fort'), ('e.g.', ', for example, '), ('TTS', 'text to speech')]]
|
1057 |
for regex, replacement in _abbreviations:
|
1058 |
text = re.sub(regex, replacement, text)
|
1059 |
return text
|
1060 |
|
1061 |
+
def chinese_number_conversion(text):
    """Replace every run of Arabic digits in *text* with its Chinese-numeral reading.

    E.g. "19423" becomes "一万九千四百二十三". Non-digit characters are left
    untouched; each maximal run of digits is converted independently.

    Adapted from:
    https://gist.github.com/gumblex/0d65cad2ba607fd14de7?permalink_comment_id=4063512#gistcomment-4063512
    """
    import bisect

    digit_chars = '零一二三四五六七八九'
    # Named decimal places and the exponent (power of ten) each one covers.
    place_names = {
        0: '',
        1: '十',
        2: '百',
        3: '千',
        4: '万',
        8: '亿',
    }
    place_exponents = sorted(place_names.keys())

    def _digit_count(value):
        # Number of decimal digits in |value|.
        return len(str(abs(value)))

    def _render(value):
        # Recursively spell out a non-negative integer.
        if value < 10:
            return digit_chars[value]
        # Largest named place that sits at or below the leading digit's position.
        exponent = place_exponents[bisect.bisect_right(place_exponents,
                                                       _digit_count(value) - 1) - 1]
        high, low = divmod(value, 10 ** exponent)
        rendered = _render(high) + place_names[exponent]
        if low:
            # Insert 零 when the remainder skips a place (e.g. 10005 -> 一万零五).
            if _digit_count(low) != exponent:
                rendered += digit_chars[0]
            rendered += _render(low)
        return rendered

    def _spell(value):
        spelled = ('负' if value < 0 else '') + _render(abs(value))
        spelled = re.sub(r'^一十', '十', spelled)  # leading 一十四 -> 十四
        spelled = re.sub(r'(?<![零十])二(?=[千万亿])', r'两', spelled)  # 二千 -> 两千
        return spelled

    return re.sub(r'\d+', lambda match: _spell(int(match.group())), text)
|
1098 |
+
|
1099 |
|
1100 |
def remove_french_spacing(text):
|
1101 |
text = text.replace(" »", '"').replace("« ", '"')
|
|
|
1105 |
|
1106 |
|
1107 |
def convert_kanji_to_pinyin_mandarin(text):
    """Convert Mandarin text to a space-separated pinyin string.

    Arabic digit runs are first rewritten as Chinese numerals (via
    chinese_number_conversion) so that numbers are transliterated too.
    """
    normalized = chinese_number_conversion(text)
    syllables = [entry[0] for entry in pinyin(normalized)]
    return " ".join(syllables)
|
1110 |
|
1111 |
|
|
|
1114 |
iso_codes_to_ids = load_json_from_path("Preprocessing/multilinguality/iso_lookup.json")[-1]
|
1115 |
except FileNotFoundError:
|
1116 |
try:
|
1117 |
+
iso_codes_to_ids = load_json_from_path(str(Path(__file__).parent / "multilinguality/iso_lookup.json"))[-1]
|
1118 |
except FileNotFoundError:
|
1119 |
iso_codes_to_ids = load_json_from_path("iso_lookup.json")[-1]
|
1120 |
if language not in iso_codes_to_ids:
|
|
|
1130 |
|
1131 |
print("\n\nChinese Test")
|
1132 |
tf = ArticulatoryCombinedTextFrontend(language="cmn")
|
1133 |
+
tf.string_to_tensor("这是一个复杂的句子,19423 它甚至包含一个停顿。", view=True)
|
1134 |
tf.string_to_tensor("李绅 《悯农》 锄禾日当午, 汗滴禾下土。 谁知盘中餐, 粒粒皆辛苦。", view=True)
|
1135 |
tf.string_to_tensor("巴 拔 把 爸 吧", view=True)
|
1136 |
|