txya900619 committed
Commit: fb0bb2b
Parent: 9b0e12a

feat: let users input pinyin

Files changed (2):
  1. configs/ipa.yaml +1 -0
  2. ipa/ipa.py +80 -30
configs/ipa.yaml CHANGED
@@ -3,6 +3,7 @@ delimiter_list: ${gh_download:FormoSpeech/FormoLexicon, release/delimiters.json,
 replace_dict: ${gh_download:FormoSpeech/FormoLexicon, release/replaced_words_htia.json, ${gh_token}}
 v2f_dict: ${gh_download:FormoSpeech/FormoLexicon, [release/v2f_goyu.json, release/v2f_htia.json], ${gh_token}}
 preserved_list: ${gh_download:FormoSpeech/FormoLexicon, release/preserved_words_htia.json, ${gh_token}}
+pinyin_to_ipa_dict: ${gh_download:FormoSpeech/FormoLexicon, release/pinyin_to_ipa_htia.json, ${gh_token}}
 lexicon:
   sixian: ${gh_download:FormoSpeech/FormoLexicon, release/lexicon_htia_sixian_c.json, ${gh_token}}
   hailu: ${gh_download:FormoSpeech/FormoLexicon, release/lexicon_htia_hailu_c.json, ${gh_token}}
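
The new pinyin_to_ipa_dict entry pulls release/pinyin_to_ipa_htia.json from FormoLexicon. That file's contents are not part of this commit; judging from how parse_pinyin_to_ipa consumes it below ("+".join(...) followed by .replace(" ", "-")), each key appears to be a toneless pinyin syllable mapped to a list of IPA strings. A minimal sketch, with invented entries:

# Hypothetical shape of pinyin_to_ipa_htia.json; the syllables and IPA
# values below are invented for illustration, not taken from FormoLexicon.
pinyin_to_ipa_dict = {
    "ngai": ["ŋ", "ai"],  # "+".join(...) gives "ŋ+ai"
    "hak": ["h a k"],     # spaces then become hyphens: "h-a-k"
}

Joining with "+" and hyphenating spaces matches the separator set that parse_ipa later strips (delete_chars="\+\-\|\_").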
ipa/ipa.py CHANGED
@@ -21,51 +21,53 @@ delimiter_regex, replace_regex, v2f_regex = prep_regex(
     ipa_configs["delimiter_list"], ipa_configs["replace_dict"], ipa_configs["v2f_dict"]
 )
 
+
 def get_ipa(raw_text, dialect):
-    lexicon = ipa_configs["lexicon"][dialect]
-    update_jieba_dict(
-        list(lexicon.keys()), Path(os.path.dirname(jieba.__file__)) / "dict.txt"
+    pinyin_split = re.split(
+        r"(?<![\da-z])(?=[\da-z])|(?<=[\da-z])(?![\da-z])", raw_text
     )
-    text = normalize_text(raw_text, ipa_configs["replace_dict"], replace_regex)
-    text = parse_num(text)
-    text_parts = [s.strip() for s in re.split(delimiter_regex, text) if s.strip()]
-    text = ",".join(text_parts)
-    word_list = run_jieba(text)
-    word_list = apply_v2f(word_list, ipa_configs["v2f_dict"], v2f_regex)
-    word_list = run_jieba("".join(word_list))
 
     final_words = []
     final_pinyin = []
     final_ipa = []
-    missing_words = []
-    for word in word_list:
-        if not bool(word.strip()):
+    final_missing_words = []
+    for hanzi_or_pinyin in pinyin_split:
+        if len(hanzi_or_pinyin.strip()) == 0:
             continue
-        if word == ",":
-            final_words.append("")
-            final_pinyin.append(",")
-            final_ipa.append(",")
-        elif word not in lexicon:
-            final_words.append(word)
-            missing_words.append(word)
+
+        if re.search(r"[\da-z]", hanzi_or_pinyin):
+            final_words.append(hanzi_or_pinyin)
+            final_pinyin.append(hanzi_or_pinyin)
+            pinyin, tone = re.match(r"([a-z]+)(\d+)?", hanzi_or_pinyin).groups()
+            tone = f"_{tone}" if tone else ""
+
+            ipa = parse_pinyin_to_ipa(pinyin)
+            if ipa is None:
+                final_missing_words.append(pinyin)
+                continue
+
+            final_ipa.append(ipa + tone)
         else:
-            final_words.append(f"{word}")
-            final_pinyin.append(lexicon[word]['pinyin'][0])
-            # NOTE only the first ipa entry in lexicon[word] is considered
-            final_ipa.append(lexicon[word]['ipa'][0].replace(" ", "-"))
+            words, ipa, pinyin, missing_words = parse_hanzi_to_ipa(hanzi_or_pinyin, dialect)
+            final_words.extend(words)
+            final_ipa.extend(ipa)
+            final_pinyin.extend(pinyin)
+            final_missing_words.extend(missing_words)
+
 
-    if len(final_ipa) == 0 or len(missing_words) > 0:
-        return final_words, final_ipa, final_pinyin, missing_words
+    if len(final_ipa) == 0 or len(final_missing_words) > 0:
+        return final_words, final_ipa, final_pinyin, final_missing_words
 
     final_words = " ".join(final_words).replace(" , ", ",")
     final_ipa = " ".join(final_ipa).replace(" , ", ",")
     final_pinyin = " ".join(final_pinyin).replace(" , ", ",")
-
-    return final_words, final_ipa, final_pinyin, missing_words
+
+    return final_words, final_ipa, final_pinyin, final_missing_words
+
 
 def parse_ipa(ipa: str, delete_chars="\+\-\|\_", as_space=""):
     text = []
-
+
     ipa_list = re.split(r"(?<![\d])(?=[\d])|(?<=[\d])(?![\d])", ipa)
     print(ipa_list)
     for word in ipa_list:
@@ -76,8 +78,56 @@ def parse_ipa(ipa: str, delete_chars="\+\-\|\_", as_space=""):
             word = re.sub(r"[{}]".format(as_space), " ", word)
         if len(delete_chars) > 0:
             word = re.sub(r"[{}]".format(delete_chars), "", word)
-
+
         word = word.replace(",", " , ")
         text.extend(word)
 
-    return text
+    return text
+
+
+def parse_pinyin_to_ipa(pinyin: str):
+    if pinyin not in ipa_configs["pinyin_to_ipa_dict"]:
+        return None
+
+    ipa_dict_result = ipa_configs["pinyin_to_ipa_dict"][pinyin]
+    ipa = "+".join(ipa_dict_result).replace(" ", "-")
+    return ipa
+
+
+def parse_hanzi_to_ipa(
+    hanzi: str, dialect: str
+) -> tuple[list[str], list[str], list[str], list[str]]:
+    lexicon = ipa_configs["lexicon"][dialect]
+    update_jieba_dict(
+        list(lexicon.keys()), Path(os.path.dirname(jieba.__file__)) / "dict.txt"
+    )
+
+    text = normalize_text(hanzi, ipa_configs["replace_dict"], replace_regex)
+    text = parse_num(text)
+    text_parts = [s.strip() for s in re.split(delimiter_regex, text) if s.strip()]
+    text = ",".join(text_parts)
+    word_list = run_jieba(text)
+    word_list = apply_v2f(word_list, ipa_configs["v2f_dict"], v2f_regex)
+    word_list = run_jieba("".join(word_list))
+
+    final_words = []
+    final_pinyin = []
+    final_ipa = []
+    missing_words = []
+    for word in word_list:
+        if not bool(word.strip()):
+            continue
+        if word == ",":
+            final_words.append(",")
+            final_pinyin.append(",")
+            final_ipa.append(",")
+        elif word not in lexicon:
+            final_words.append(word)
+            missing_words.append(word)
+        else:
+            final_words.append(f"{word}")
+            final_pinyin.append(lexicon[word]["pinyin"][0])
+            # NOTE only the first ipa entry in lexicon[word] is considered
+            final_ipa.append(lexicon[word]["ipa"][0])
+
+    return final_words, final_ipa, final_pinyin, missing_words
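
A rough usage sketch of the reworked get_ipa (the input string, dialect, and lookup results here are illustrative; actual coverage depends on the downloaded FormoLexicon files):

# Hypothetical call; "sixian" is one of the dialect keys in configs/ipa.yaml.
words, ipa, pinyin, missing = get_ipa("你好 ngai11", "sixian")
# The new split regex cuts the input at boundaries between runs of [0-9a-z]
# and everything else: "你好 ngai11" -> ["你好 ", "ngai11", ""], with empty
# pieces skipped by the strip() check. "你好 " still goes through jieba via
# parse_hanzi_to_ipa, while "ngai11" is divided by
# re.match(r"([a-z]+)(\d+)?", ...) into the syllable "ngai" (looked up with
# parse_pinyin_to_ipa) and the tone "11", appended as the suffix "_11".
# Syllables absent from pinyin_to_ipa_dict end up in missing.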