Flux9665 committed on
Commit
1d10354
1 Parent(s): 355d903

Update Preprocessing/TextFrontend.py

Browse files
Files changed (1) hide show
  1. Preprocessing/TextFrontend.py +44 -4
Preprocessing/TextFrontend.py CHANGED
@@ -4,6 +4,7 @@
4
  import json
5
  import logging
6
  import re
 
7
 
8
  import torch
9
  from dragonmapper.transcriptions import pinyin_to_ipa
@@ -848,7 +849,7 @@ class ArticulatoryCombinedTextFrontend:
848
  # languages use different tones denoted by different numbering
849
  # systems. At this point in the script, it is attempted to unify
850
  # them all to the tones in the IPA standard.
851
- if self.g2p_lang == "vi":
852
  phones = phones.replace('1', "˧")
853
  phones = phones.replace('2', "˨˩")
854
  phones = phones.replace('ɜ', "˧˥") # I'm fairly certain that this is a bug in espeak and ɜ is meant to be 3
@@ -1052,11 +1053,49 @@ def english_text_expansion(text):
1052
  _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in
1053
  [('Mrs.', 'misess'), ('Mr.', 'mister'), ('Dr.', 'doctor'), ('St.', 'saint'), ('Co.', 'company'), ('Jr.', 'junior'), ('Maj.', 'major'),
1054
  ('Gen.', 'general'), ('Drs.', 'doctors'), ('Rev.', 'reverend'), ('Lt.', 'lieutenant'), ('Hon.', 'honorable'), ('Sgt.', 'sergeant'),
1055
- ('Capt.', 'captain'), ('Esq.', 'esquire'), ('Ltd.', 'limited'), ('Col.', 'colonel'), ('Ft.', 'fort')]]
1056
  for regex, replacement in _abbreviations:
1057
  text = re.sub(regex, replacement, text)
1058
  return text
1059
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1060
 
1061
  def remove_french_spacing(text):
1062
  text = text.replace(" »", '"').replace("« ", '"')
@@ -1066,6 +1105,7 @@ def remove_french_spacing(text):
1066
 
1067
 
1068
  def convert_kanji_to_pinyin_mandarin(text):
 
1069
  return " ".join([x[0] for x in pinyin(text)])
1070
 
1071
 
@@ -1074,7 +1114,7 @@ def get_language_id(language):
1074
  iso_codes_to_ids = load_json_from_path("Preprocessing/multilinguality/iso_lookup.json")[-1]
1075
  except FileNotFoundError:
1076
  try:
1077
- iso_codes_to_ids = load_json_from_path("multilinguality/iso_lookup.json")[-1]
1078
  except FileNotFoundError:
1079
  iso_codes_to_ids = load_json_from_path("iso_lookup.json")[-1]
1080
  if language not in iso_codes_to_ids:
@@ -1090,7 +1130,7 @@ if __name__ == '__main__':
1090
 
1091
  print("\n\nChinese Test")
1092
  tf = ArticulatoryCombinedTextFrontend(language="cmn")
1093
- tf.string_to_tensor("这是一个复杂的句子,它甚至包含一个停顿。", view=True)
1094
  tf.string_to_tensor("李绅 《悯农》 锄禾日当午, 汗滴禾下土。 谁知盘中餐, 粒粒皆辛苦。", view=True)
1095
  tf.string_to_tensor("巴 拔 把 爸 吧", view=True)
1096
 
 
4
  import json
5
  import logging
6
  import re
7
+ from pathlib import Path
8
 
9
  import torch
10
  from dragonmapper.transcriptions import pinyin_to_ipa
 
849
  # languages use different tones denoted by different numbering
850
  # systems. At this point in the script, it is attempted to unify
851
  # them all to the tones in the IPA standard.
852
+ if self.g2p_lang == "vi" or self.g2p_lang == "vi-vn-x-central" or self.g2p_lang == "vi-vn-x-south":
853
  phones = phones.replace('1', "˧")
854
  phones = phones.replace('2', "˨˩")
855
  phones = phones.replace('ɜ', "˧˥") # I'm fairly certain that this is a bug in espeak and ɜ is meant to be 3
 
1053
  _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in
1054
  [('Mrs.', 'misess'), ('Mr.', 'mister'), ('Dr.', 'doctor'), ('St.', 'saint'), ('Co.', 'company'), ('Jr.', 'junior'), ('Maj.', 'major'),
1055
  ('Gen.', 'general'), ('Drs.', 'doctors'), ('Rev.', 'reverend'), ('Lt.', 'lieutenant'), ('Hon.', 'honorable'), ('Sgt.', 'sergeant'),
1056
+ ('Capt.', 'captain'), ('Esq.', 'esquire'), ('Ltd.', 'limited'), ('Col.', 'colonel'), ('Ft.', 'fort'), ('e.g.', ', for example, '), ('TTS', 'text to speech')]]
1057
  for regex, replacement in _abbreviations:
1058
  text = re.sub(regex, replacement, text)
1059
  return text
1060
 
1061
+ def chinese_number_conversion(text):
1062
+ # https://gist.github.com/gumblex/0d65cad2ba607fd14de7?permalink_comment_id=4063512#gistcomment-4063512
1063
+ import bisect
1064
+ zhdigits = '零一二三四五六七八九'
1065
+ zhplaces = {
1066
+ 0: '',
1067
+ 1: '十',
1068
+ 2: '百',
1069
+ 3: '千',
1070
+ 4: '万',
1071
+ 8: '亿',
1072
+ }
1073
+ zhplace_keys = sorted(zhplaces.keys())
1074
+
1075
+ def numdigits(n):
1076
+ return len(str(abs(n)))
1077
+
1078
+ def _zhnum(n):
1079
+ if n < 10:
1080
+ return zhdigits[n]
1081
+ named_place_len = zhplace_keys[bisect.bisect_right(zhplace_keys,
1082
+ numdigits(n) - 1) - 1]
1083
+ left_part, right_part = n // 10 ** named_place_len, n % 10 ** named_place_len
1084
+ return (_zhnum(left_part) +
1085
+ zhplaces[named_place_len] +
1086
+ ((zhdigits[0] if numdigits(right_part) != named_place_len else '') +
1087
+ _zhnum(right_part)
1088
+ if right_part else ''))
1089
+
1090
+ def zhnum(n):
1091
+ answer = ('负' if n < 0 else '') + _zhnum(abs(n))
1092
+ answer = re.sub(r'^一十', '十', answer)
1093
+ answer = re.sub(r'(?<![零十])二(?=[千万亿])', r'两', answer)
1094
+ return answer
1095
+
1096
+
1097
+ return re.sub(r'\d+', lambda x: zhnum(int(x.group())), text)
1098
+
1099
 
1100
  def remove_french_spacing(text):
1101
  text = text.replace(" »", '"').replace("« ", '"')
 
1105
 
1106
 
1107
  def convert_kanji_to_pinyin_mandarin(text):
1108
+ text = chinese_number_conversion(text)
1109
  return " ".join([x[0] for x in pinyin(text)])
1110
 
1111
 
 
1114
  iso_codes_to_ids = load_json_from_path("Preprocessing/multilinguality/iso_lookup.json")[-1]
1115
  except FileNotFoundError:
1116
  try:
1117
+ iso_codes_to_ids = load_json_from_path(str(Path(__file__).parent / "multilinguality/iso_lookup.json"))[-1]
1118
  except FileNotFoundError:
1119
  iso_codes_to_ids = load_json_from_path("iso_lookup.json")[-1]
1120
  if language not in iso_codes_to_ids:
 
1130
 
1131
  print("\n\nChinese Test")
1132
  tf = ArticulatoryCombinedTextFrontend(language="cmn")
1133
+ tf.string_to_tensor("这是一个复杂的句子,19423 它甚至包含一个停顿。", view=True)
1134
  tf.string_to_tensor("李绅 《悯农》 锄禾日当午, 汗滴禾下土。 谁知盘中餐, 粒粒皆辛苦。", view=True)
1135
  tf.string_to_tensor("巴 拔 把 爸 吧", view=True)
1136