txya900619 committed
Commit: fb0bb2b
Parent: 9b0e12a

feat: let users input pinyin

Files changed (2):
  1. configs/ipa.yaml +1 -0
  2. ipa/ipa.py +80 -30
configs/ipa.yaml CHANGED
@@ -3,6 +3,7 @@ delimiter_list: ${gh_download:FormoSpeech/FormoLexicon, release/delimiters.json,
 replace_dict: ${gh_download:FormoSpeech/FormoLexicon, release/replaced_words_htia.json, ${gh_token}}
 v2f_dict: ${gh_download:FormoSpeech/FormoLexicon, [release/v2f_goyu.json, release/v2f_htia.json], ${gh_token}}
 preserved_list: ${gh_download:FormoSpeech/FormoLexicon, release/preserved_words_htia.json, ${gh_token}}
+pinyin_to_ipa_dict: ${gh_download:FormoSpeech/FormoLexicon, release/pinyin_to_ipa_htia.json, ${gh_token}}
 lexicon:
   sixian: ${gh_download:FormoSpeech/FormoLexicon, release/lexicon_htia_sixian_c.json, ${gh_token}}
   hailu: ${gh_download:FormoSpeech/FormoLexicon, release/lexicon_htia_hailu_c.json, ${gh_token}}
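
The new pinyin_to_ipa_dict entry pulls release/pinyin_to_ipa_htia.json from FormoLexicon. That file's contents are not part of this commit; judging from how parse_pinyin_to_ipa consumes it below ("+".join(...) followed by .replace(" ", "-")), each key appears to be a toneless pinyin syllable mapped to a list of IPA strings. A minimal sketch, with invented entries:

# Hypothetical shape of pinyin_to_ipa_htia.json; the syllables and IPA
# values below are invented for illustration, not taken from FormoLexicon.
pinyin_to_ipa_dict = {
    "ngai": ["ŋ", "ai"],  # "+".join(...) gives "ŋ+ai"
    "hak": ["h a k"],     # spaces then become hyphens: "h-a-k"
}

Joining with "+" and hyphenating spaces matches the separator set that parse_ipa later strips (delete_chars="\+\-\|\_").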
ipa/ipa.py CHANGED
@@ -21,51 +21,53 @@ delimiter_regex, replace_regex, v2f_regex = prep_regex(
     ipa_configs["delimiter_list"], ipa_configs["replace_dict"], ipa_configs["v2f_dict"]
 )
 
+
 def get_ipa(raw_text, dialect):
-    lexicon = ipa_configs["lexicon"][dialect]
-    update_jieba_dict(
-        list(lexicon.keys()), Path(os.path.dirname(jieba.__file__)) / "dict.txt"
+    pinyin_split = re.split(
+        r"(?<![\da-z])(?=[\da-z])|(?<=[\da-z])(?![\da-z])", raw_text
     )
-    text = normalize_text(raw_text, ipa_configs["replace_dict"], replace_regex)
-    text = parse_num(text)
-    text_parts = [s.strip() for s in re.split(delimiter_regex, text) if s.strip()]
-    text = ",".join(text_parts)
-    word_list = run_jieba(text)
-    word_list = apply_v2f(word_list, ipa_configs["v2f_dict"], v2f_regex)
-    word_list = run_jieba("".join(word_list))
 
     final_words = []
     final_pinyin = []
     final_ipa = []
-    missing_words = []
-    for word in word_list:
-        if not bool(word.strip()):
+    final_missing_words = []
+    for hanzi_or_pinyin in pinyin_split:
+        if len(hanzi_or_pinyin.strip()) == 0:
             continue
-        if word == ",":
-            final_words.append("")
-            final_pinyin.append(",")
-            final_ipa.append(",")
-        elif word not in lexicon:
-            final_words.append(word)
-            missing_words.append(word)
+
+        if re.search(r"[\da-z]", hanzi_or_pinyin):
+            final_words.append(hanzi_or_pinyin)
+            final_pinyin.append(hanzi_or_pinyin)
+            pinyin, tone = re.match(r"([a-z]+)(\d+)?", hanzi_or_pinyin).groups()
+            tone = f"_{tone}" if tone else ""
+
+            ipa = parse_pinyin_to_ipa(pinyin)
+            if ipa is None:
+                final_missing_words.append(pinyin)
+                continue
+
+            final_ipa.append(ipa + tone)
         else:
-            final_words.append(f"{word}")
-            final_pinyin.append(lexicon[word]['pinyin'][0])
-            # NOTE only the first ipa entry in lexicon[word] is considered
-            final_ipa.append(lexicon[word]['ipa'][0].replace(" ", "-"))
+            words, ipa, pinyin, missing_words = parse_hanzi_to_ipa(hanzi_or_pinyin, dialect)
+            final_words.extend(words)
+            final_ipa.extend(ipa)
+            final_pinyin.extend(pinyin)
+            final_missing_words.extend(missing_words)
+
 
-    if len(final_ipa) == 0 or len(missing_words) > 0:
-        return final_words, final_ipa, final_pinyin, missing_words
+    if len(final_ipa) == 0 or len(final_missing_words) > 0:
+        return final_words, final_ipa, final_pinyin, final_missing_words
 
     final_words = " ".join(final_words).replace(" , ", ",")
     final_ipa = " ".join(final_ipa).replace(" , ", ",")
     final_pinyin = " ".join(final_pinyin).replace(" , ", ",")
-
-    return final_words, final_ipa, final_pinyin, missing_words
+
+    return final_words, final_ipa, final_pinyin, final_missing_words
+
 
 def parse_ipa(ipa: str, delete_chars="\+\-\|\_", as_space=""):
     text = []
-
+
     ipa_list = re.split(r"(?<![\d])(?=[\d])|(?<=[\d])(?![\d])", ipa)
     print(ipa_list)
     for word in ipa_list:
@@ -76,8 +78,56 @@ def parse_ipa(ipa: str, delete_chars="\+\-\|\_", as_space=""):
             word = re.sub(r"[{}]".format(as_space), " ", word)
         if len(delete_chars) > 0:
             word = re.sub(r"[{}]".format(delete_chars), "", word)
-
+
         word = word.replace(",", " , ")
         text.extend(word)
 
-    return text
+    return text
+
+
+def parse_pinyin_to_ipa(pinyin: str):
+    if pinyin not in ipa_configs["pinyin_to_ipa_dict"]:
+        return None
+
+    ipa_dict_result = ipa_configs["pinyin_to_ipa_dict"][pinyin]
+    ipa = "+".join(ipa_dict_result).replace(" ", "-")
+    return ipa
+
+
+def parse_hanzi_to_ipa(
+    hanzi: str, dialect: str
+) -> tuple[list[str], list[str], list[str], list[str]]:
+    lexicon = ipa_configs["lexicon"][dialect]
+    update_jieba_dict(
+        list(lexicon.keys()), Path(os.path.dirname(jieba.__file__)) / "dict.txt"
+    )
+
+    text = normalize_text(hanzi, ipa_configs["replace_dict"], replace_regex)
+    text = parse_num(text)
+    text_parts = [s.strip() for s in re.split(delimiter_regex, text) if s.strip()]
+    text = ",".join(text_parts)
+    word_list = run_jieba(text)
+    word_list = apply_v2f(word_list, ipa_configs["v2f_dict"], v2f_regex)
+    word_list = run_jieba("".join(word_list))
+
+    final_words = []
+    final_pinyin = []
+    final_ipa = []
+    missing_words = []
+    for word in word_list:
+        if not bool(word.strip()):
+            continue
+        if word == ",":
+            final_words.append(",")
+            final_pinyin.append(",")
+            final_ipa.append(",")
+        elif word not in lexicon:
+            final_words.append(word)
+            missing_words.append(word)
+        else:
+            final_words.append(f"{word}")
+            final_pinyin.append(lexicon[word]["pinyin"][0])
+            # NOTE only the first ipa entry in lexicon[word] is considered
+            final_ipa.append(lexicon[word]["ipa"][0])
+
+    return final_words, final_ipa, final_pinyin, missing_words
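
A rough usage sketch of the reworked get_ipa (the input string, dialect, and lookup results here are illustrative; actual coverage depends on the downloaded FormoLexicon files):

# Hypothetical call; "sixian" is one of the dialect keys in configs/ipa.yaml.
words, ipa, pinyin, missing = get_ipa("你好 ngai11", "sixian")
# The new split regex cuts the input at boundaries between runs of [0-9a-z]
# and everything else: "你好 ngai11" -> ["你好 ", "ngai11", ""], with empty
# pieces skipped by the strip() check. "你好 " still goes through jieba via
# parse_hanzi_to_ipa, while "ngai11" is divided by
# re.match(r"([a-z]+)(\d+)?", ...) into the syllable "ngai" (looked up with
# parse_pinyin_to_ipa) and the tone "11", appended as the suffix "_11".
# Syllables absent from pinyin_to_ipa_dict end up in missing.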