Commit fb0bb2b by txya900619 (1 parent: 9b0e12a)

    feat: let user can input pinyin

Files changed:
- configs/ipa.yaml (+1, -0)
- ipa/ipa.py (+80, -30)
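In short: get_ipa now splits the raw input into hanzi runs and lowercase pinyin runs. Pinyin syllables (with an optional trailing tone number) are looked up in the new pinyin_to_ipa_dict, while the old hanzi pipeline moves into a parse_hanzi_to_ipa helper. A minimal usage sketch of the updated entry point (the input string and dialect value are hypothetical; the four-tuple return shape comes from the diff below):

    # Mixed hanzi + pinyin-with-tone input; "sixian" is one of the configured lexicons.
    words, ipa, pinyin, missing = get_ipa("歡迎oi24大家", "sixian")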
configs/ipa.yaml  CHANGED

@@ -3,6 +3,7 @@ delimiter_list: ${gh_download:FormoSpeech/FormoLexicon, release/delimiters.json,
 replace_dict: ${gh_download:FormoSpeech/FormoLexicon, release/replaced_words_htia.json, ${gh_token}}
 v2f_dict: ${gh_download:FormoSpeech/FormoLexicon, [release/v2f_goyu.json, release/v2f_htia.json], ${gh_token}}
 preserved_list: ${gh_download:FormoSpeech/FormoLexicon, release/preserved_words_htia.json, ${gh_token}}
+pinyin_to_ipa_dict: ${gh_download:FormoSpeech/FormoLexicon, release/pinyin_to_ipa_htia.json, ${gh_token}}
 lexicon:
   sixian: ${gh_download:FormoSpeech/FormoLexicon, release/lexicon_htia_sixian_c.json, ${gh_token}}
   hailu: ${gh_download:FormoSpeech/FormoLexicon, release/lexicon_htia_hailu_c.json, ${gh_token}}
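The one-line config change wires in a new FormoLexicon resource, downloaded like its siblings. Judging from how parse_pinyin_to_ipa consumes it ("+".join(entry).replace(" ", "-")), the JSON presumably maps each pinyin syllable to a list of IPA strings; a hypothetical entry, invented purely for illustration:

    {"oi": ["ɔi"], "ngin": ["ŋ in"]}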
ipa/ipa.py  CHANGED

@@ -21,51 +21,53 @@ delimiter_regex, replace_regex, v2f_regex = prep_regex(
     ipa_configs["delimiter_list"], ipa_configs["replace_dict"], ipa_configs["v2f_dict"]
 )
 
+
 def get_ipa(raw_text, dialect):
-    lexicon = ipa_configs["lexicon"][dialect]
-    update_jieba_dict(
-        list(lexicon.keys()), Path(os.path.dirname(jieba.__file__)) / "dict.txt"
+    pinyin_split = re.split(
+        r"(?<![\da-z])(?=[\da-z])|(?<=[\da-z])(?![\da-z])", raw_text
     )
-    text = normalize_text(raw_text, ipa_configs["replace_dict"], replace_regex)
-    text = parse_num(text)
-    text_parts = [s.strip() for s in re.split(delimiter_regex, text) if s.strip()]
-    text = ",".join(text_parts)
-    word_list = run_jieba(text)
-    word_list = apply_v2f(word_list, ipa_configs["v2f_dict"], v2f_regex)
-    word_list = run_jieba("".join(word_list))
 
     final_words = []
     final_pinyin = []
     final_ipa = []
-    missing_words = []
-    for word in word_list:
-        if not bool(word.strip()):
+    final_missing_words = []
+    for hanzi_or_pinyin in pinyin_split:
+        if len(hanzi_or_pinyin.strip()) == 0:
             continue
-        if word == ",":
-            final_words.append(",")
-            final_pinyin.append(",")
-            final_ipa.append(",")
-        elif word not in lexicon:
-            final_words.append(word)
-            missing_words.append(word)
+
+        if re.search(r"[\da-z]", hanzi_or_pinyin):
+            final_words.append(hanzi_or_pinyin)
+            final_pinyin.append(hanzi_or_pinyin)
+            pinyin, tone = re.match(r"([a-z]+)(\d+)?", hanzi_or_pinyin).groups()
+            tone = f"_{tone}" if tone else ""
+
+            ipa = parse_pinyin_to_ipa(pinyin)
+            if ipa is None:
+                final_missing_words.append(pinyin)
+                continue
+
+            final_ipa.append(ipa + tone)
         else:
-            final_words.append(f"{word}")
-            final_pinyin.append(lexicon[word]["pinyin"][0])
-            # NOTE: only the first ipa entry in lexicon[word] is considered
-            final_ipa.append(lexicon[word]["ipa"][0])
+            words, ipa, pinyin, missing_words = parse_hanzi_to_ipa(hanzi_or_pinyin, dialect)
+            final_words.extend(words)
+            final_ipa.extend(ipa)
+            final_pinyin.extend(pinyin)
+            final_missing_words.extend(missing_words)
+
 
-    if len(final_ipa) == 0 or len(missing_words) > 0:
+    if len(final_ipa) == 0 or len(final_missing_words) > 0:
         return final_words, final_ipa, final_pinyin, missing_words
 
     final_words = " ".join(final_words).replace(" , ", ",")
     final_ipa = " ".join(final_ipa).replace(" , ", ",")
     final_pinyin = " ".join(final_pinyin).replace(" , ", ",")
-
-    return final_words, final_ipa, final_pinyin, missing_words
+
+    return final_words, final_ipa, final_pinyin, final_missing_words
+
 
 def parse_ipa(ipa: str, delete_chars="\+\-\|\_", as_space=""):
     text = []
-
+
     ipa_list = re.split(r"(?<![\d])(?=[\d])|(?<=[\d])(?![\d])", ipa)
     print(ipa_list)
     for word in ipa_list:
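The two regexes doing the new work are easiest to read by example. A short demo (the input string is made up; the behavior follows directly from the patterns in the hunk above):

    import re

    raw_text = "歡迎oi24大家"  # hypothetical mixed hanzi/pinyin input
    # Zero-width split: cut wherever a run of [0-9a-z] starts or ends.
    parts = re.split(r"(?<![\da-z])(?=[\da-z])|(?<=[\da-z])(?![\da-z])", raw_text)
    print(parts)  # ['歡迎', 'oi24', '大家']

    # Letters and optional tone digits are then separated; "oi24" gives
    # pinyin "oi" and tone "24", rendered as the suffix "_24" on the IPA.
    pinyin, tone = re.match(r"([a-z]+)(\d+)?", "oi24").groups()
    print(pinyin, tone)  # oi 24

Note that the character classes cover only lowercase letters, so pinyin input is expected in lowercase.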
@@ -76,8 +78,56 @@ def parse_ipa(ipa: str, delete_chars="\+\-\|\_", as_space=""):
             word = re.sub(r"[{}]".format(as_space), " ", word)
         if len(delete_chars) > 0:
             word = re.sub(r"[{}]".format(delete_chars), "", word)
-
+
         word = word.replace(",", " , ")
         text.extend(word)
 
-    return text
+    return text
+
+
+def parse_pinyin_to_ipa(pinyin: str):
+    if pinyin not in ipa_configs["pinyin_to_ipa_dict"]:
+        return None
+
+    ipa_dict_result = ipa_configs["pinyin_to_ipa_dict"][pinyin]
+    ipa = "+".join(ipa_dict_result).replace(" ", "-")
+    return ipa
+
+
+def parse_hanzi_to_ipa(
+    hanzi: str, dialect: str
+) -> tuple[list[str], list[str], list[str], list[str]]:
+    lexicon = ipa_configs["lexicon"][dialect]
+    update_jieba_dict(
+        list(lexicon.keys()), Path(os.path.dirname(jieba.__file__)) / "dict.txt"
+    )
+
+    text = normalize_text(hanzi, ipa_configs["replace_dict"], replace_regex)
+    text = parse_num(text)
+    text_parts = [s.strip() for s in re.split(delimiter_regex, text) if s.strip()]
+    text = ",".join(text_parts)
+    word_list = run_jieba(text)
+    word_list = apply_v2f(word_list, ipa_configs["v2f_dict"], v2f_regex)
+    word_list = run_jieba("".join(word_list))
+
+    final_words = []
+    final_pinyin = []
+    final_ipa = []
+    missing_words = []
+    for word in word_list:
+        if not bool(word.strip()):
+            continue
+        if word == ",":
+            final_words.append(",")
+            final_pinyin.append(",")
+            final_ipa.append(",")
+        elif word not in lexicon:
+            final_words.append(word)
+            missing_words.append(word)
+        else:
+            final_words.append(f"{word}")
+            final_pinyin.append(lexicon[word]["pinyin"][0])
+            # NOTE: only the first ipa entry in lexicon[word] is considered
+            final_ipa.append(lexicon[word]["ipa"][0])
+
+    return final_words, final_ipa, final_pinyin, missing_words
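For completeness, a sketch of how parse_pinyin_to_ipa assembles its output, assuming a dict value is a list of space-separated IPA strings (the value below is invented):

    ipa_dict_result = ["t͡s ie", "n"]  # hypothetical pinyin_to_ipa_dict value
    ipa = "+".join(ipa_dict_result).replace(" ", "-")
    print(ipa)  # "t͡s-ie+n": list items joined with "+", internal spaces become "-"

One caveat visible in the diff: the early-return branch of the new get_ipa still returns missing_words, a name now bound only inside the hanzi branch, rather than final_missing_words.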