ElesisSiegherts committed
Commit c729232
1 Parent(s): a2521ef

Upload 15 files

tools/__init__.py ADDED
@@ -0,0 +1,3 @@
+ """
+ Tools package
+ """
tools/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (160 Bytes)
tools/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (158 Bytes)
tools/__pycache__/classify_language.cpython-310.pyc ADDED
Binary file (2.78 kB)
tools/__pycache__/classify_language.cpython-38.pyc ADDED
Binary file (3.17 kB)
tools/__pycache__/log.cpython-310.pyc ADDED
Binary file (399 Bytes)
tools/__pycache__/log.cpython-38.pyc ADDED
Binary file (397 Bytes)
tools/__pycache__/sentence.cpython-310.pyc ADDED
Binary file (4.65 kB)
tools/__pycache__/sentence.cpython-38.pyc ADDED
Binary file (4.63 kB)
tools/__pycache__/translate.cpython-310.pyc ADDED
Binary file (1.6 kB)
tools/__pycache__/translate.cpython-38.pyc ADDED
Binary file (1.6 kB)
tools/classify_language.py ADDED
@@ -0,0 +1,189 @@
+ import regex as re
+
+ try:
+     from config import config
+
+     LANGUAGE_IDENTIFICATION_LIBRARY = (
+         config.webui_config.language_identification_library
+     )
+ except Exception:
+     LANGUAGE_IDENTIFICATION_LIBRARY = "langid"
+
+ module = LANGUAGE_IDENTIFICATION_LIBRARY.lower()
+
+ langid_languages = [
+     "af",
+     "am",
+     "an",
+     "ar",
+     "as",
+     "az",
+     "be",
+     "bg",
+     "bn",
+     "br",
+     "bs",
+     "ca",
+     "cs",
+     "cy",
+     "da",
+     "de",
+     "dz",
+     "el",
+     "en",
+     "eo",
+     "es",
+     "et",
+     "eu",
+     "fa",
+     "fi",
+     "fo",
+     "fr",
+     "ga",
+     "gl",
+     "gu",
+     "he",
+     "hi",
+     "hr",
+     "ht",
+     "hu",
+     "hy",
+     "id",
+     "is",
+     "it",
+     "ja",
+     "jv",
+     "ka",
+     "kk",
+     "km",
+     "kn",
+     "ko",
+     "ku",
+     "ky",
+     "la",
+     "lb",
+     "lo",
+     "lt",
+     "lv",
+     "mg",
+     "mk",
+     "ml",
+     "mn",
+     "mr",
+     "ms",
+     "mt",
+     "nb",
+     "ne",
+     "nl",
+     "nn",
+     "no",
+     "oc",
+     "or",
+     "pa",
+     "pl",
+     "ps",
+     "pt",
+     "qu",
+     "ro",
+     "ru",
+     "rw",
+     "se",
+     "si",
+     "sk",
+     "sl",
+     "sq",
+     "sr",
+     "sv",
+     "sw",
+     "ta",
+     "te",
+     "th",
+     "tl",
+     "tr",
+     "ug",
+     "uk",
+     "ur",
+     "vi",
+     "vo",
+     "wa",
+     "xh",
+     "zh",
+     "zu",
+ ]
+
+
+ def classify_language(text: str, target_languages: list = None) -> str:
+     if module == "fastlid" or module == "fasttext":
+         from fastlid import fastlid, supported_langs
+
+         classifier = fastlid
+         if target_languages is not None:
+             target_languages = [
+                 lang for lang in target_languages if lang in supported_langs
+             ]
+             fastlid.set_languages = target_languages
+     elif module == "langid":
+         import langid
+
+         classifier = langid.classify
+         if target_languages is not None:
+             target_languages = [
+                 lang for lang in target_languages if lang in langid_languages
+             ]
+             langid.set_languages(target_languages)
+     else:
+         raise ValueError(f"Wrong module {module}")
+
+     lang = classifier(text)[0]
+
+     return lang
+
+
+ def classify_zh_ja(text: str) -> str:
+     for idx, char in enumerate(text):
+         unicode_val = ord(char)
+
+         # Detect Japanese kana (hiragana / katakana)
+         if 0x3040 <= unicode_val <= 0x309F or 0x30A0 <= unicode_val <= 0x30FF:
+             return "ja"
+
+         # Detect Han (CJK) characters
+         if 0x4E00 <= unicode_val <= 0x9FFF:
+             # Check the following character: an adjacent kana implies Japanese
+             next_char = text[idx + 1] if idx + 1 < len(text) else None
+
+             if next_char and (
+                 0x3040 <= ord(next_char) <= 0x309F or 0x30A0 <= ord(next_char) <= 0x30FF
+             ):
+                 return "ja"
+
+     return "zh"
+
+
+ def split_alpha_nonalpha(text, mode=1):
+     if mode == 1:
+         pattern = r"(?<=[\u4e00-\u9fff\u3040-\u30FF\d])(?=[\p{Latin}])|(?<=[\p{Latin}])(?=[\u4e00-\u9fff\u3040-\u30FF\d])"
+     elif mode == 2:
+         pattern = r"(?<=[\u4e00-\u9fff\u3040-\u30FF])(?=[\p{Latin}\d])|(?<=[\p{Latin}\d])(?=[\u4e00-\u9fff\u3040-\u30FF])"
+     else:
+         raise ValueError("Invalid mode. Supported modes are 1 and 2.")
+
+     return re.split(pattern, text)
+
+
+ if __name__ == "__main__":
+     text = "这是一个测试文本"
+     print(classify_language(text))
+     print(classify_zh_ja(text))  # "zh"
+
+     text = "これはテストテキストです"
+     print(classify_language(text))
+     print(classify_zh_ja(text))  # "ja"
+
+     text = "vits和Bert-VITS2是tts模型。花费3days.花费3天。Take 3 days"
+
+     print(split_alpha_nonalpha(text, mode=1))
+     # output: ['vits', '和', 'Bert-VITS', '2是', 'tts', '模型。花费3', 'days.花费3天。Take 3 days']
+
+     print(split_alpha_nonalpha(text, mode=2))
+     # output: ['vits', '和', 'Bert-VITS2', '是', 'tts', '模型。花费', '3days.花费', '3', '天。Take 3 days']
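
A quick usage sketch of the two entry points (a minimal example, assuming langid is installed and the package root is on sys.path; labels for short strings depend on the classifier, so the expected outputs are illustrative):

    from tools.classify_language import classify_language, classify_zh_ja

    # Restricting the candidate set usually helps on short inputs
    print(classify_language("Hello world", target_languages=["en", "zh", "ja"]))  # "en"

    # Rule-based zh/ja disambiguation: kana anywhere, or next to a Han character, means "ja"
    print(classify_zh_ja("今日はいい天気"))  # "ja"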
tools/log.py ADDED
@@ -0,0 +1,16 @@
+ """
+ Logger wrapper
+ """
+ from loguru import logger
+ import sys
+
+
+ # Remove all default handlers
+ logger.remove()
+
+ # Custom format, attached to stdout
+ log_format = (
+     "<g>{time:MM-DD HH:mm:ss}</g> <lvl>{level:<9}</lvl>| {file}:{line} | {message}"
+ )
+
+ logger.add(sys.stdout, format=log_format, backtrace=True, diagnose=True)
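
Because loguru's logger is a process-wide singleton, importing it from this module is enough for every caller to share the sink configured above (a minimal sketch; the printed prefix follows log_format):

    from tools.log import logger

    logger.info("model loaded")
    # e.g. 10-27 14:30:00 INFO     | main.py:3 | model loaded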
tools/sentence.py ADDED
@@ -0,0 +1,169 @@
+ import logging
+
+ import regex as re
+
+ from tools.classify_language import classify_language, split_alpha_nonalpha
+
+
+ def check_is_none(item) -> bool:
+     """none -> True, not none -> False"""
+     return (
+         item is None
+         or (isinstance(item, str) and str(item).isspace())
+         or str(item) == ""
+     )
+
+
+ def markup_language(text: str, target_languages: list = None) -> str:
+     pattern = (
+         r"[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`"
+         r"\!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」"
+         r"『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+"
+     )
+     sentences = re.split(pattern, text)
+
+     pre_lang = ""
+     p = 0
+
+     if target_languages is not None:
+         sorted_target_languages = sorted(target_languages)
+         if sorted_target_languages in [["en", "zh"], ["en", "ja"], ["en", "ja", "zh"]]:
+             new_sentences = []
+             for sentence in sentences:
+                 new_sentences.extend(split_alpha_nonalpha(sentence))
+             sentences = new_sentences
+
+     for sentence in sentences:
+         if check_is_none(sentence):
+             continue
+
+         lang = classify_language(sentence, target_languages)
+
+         if pre_lang == "":
+             text = text[:p] + text[p:].replace(
+                 sentence, f"[{lang.upper()}]{sentence}", 1
+             )
+             p += len(f"[{lang.upper()}]")
+         elif pre_lang != lang:
+             text = text[:p] + text[p:].replace(
+                 sentence, f"[{pre_lang.upper()}][{lang.upper()}]{sentence}", 1
+             )
+             p += len(f"[{pre_lang.upper()}][{lang.upper()}]")
+         pre_lang = lang
+         p += text[p:].index(sentence) + len(sentence)
+     text += f"[{pre_lang.upper()}]"
+
+     return text
+
+
+ def split_by_language(text: str, target_languages: list = None) -> list:
+     pattern = (
+         r"[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`"
+         r"\!?\。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」"
+         r"『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+"
+     )
+     sentences = re.split(pattern, text)
+
+     pre_lang = ""
+     start = 0
+     end = 0
+     sentences_list = []
+
+     if target_languages is not None:
+         sorted_target_languages = sorted(target_languages)
+         if sorted_target_languages in [["en", "zh"], ["en", "ja"], ["en", "ja", "zh"]]:
+             new_sentences = []
+             for sentence in sentences:
+                 new_sentences.extend(split_alpha_nonalpha(sentence))
+             sentences = new_sentences
+
+     for sentence in sentences:
+         if check_is_none(sentence):
+             continue
+
+         lang = classify_language(sentence, target_languages)
+
+         end += text[end:].index(sentence)
+         if pre_lang != "" and pre_lang != lang:
+             sentences_list.append((text[start:end], pre_lang))
+             start = end
+         end += len(sentence)
+         pre_lang = lang
+     sentences_list.append((text[start:], pre_lang))
+
+     return sentences_list
+
+
+ def sentence_split(text: str, max: int) -> list:
+     pattern = r"[!(),—+\-.:;??。,、;:]+"
+     sentences = re.split(pattern, text)
+     discarded_chars = re.findall(pattern, text)
+
+     sentences_list, count, p = [], 0, 0
+
+     # Walk the text delimiter by delimiter
+     for i, discarded_char in enumerate(discarded_chars):
+         count += len(sentences[i]) + len(discarded_char)
+         if count >= max:
+             sentences_list.append(text[p : p + count].strip())
+             p += count
+             count = 0
+
+     # Append whatever text is left over
+     if p < len(text):
+         sentences_list.append(text[p:])
+
+     return sentences_list
+
+
+ def sentence_split_and_markup(text, max=50, lang="auto", speaker_lang=None):
+     # If the speaker only supports one language
+     if speaker_lang is not None and len(speaker_lang) == 1:
+         if lang.upper() not in ["AUTO", "MIX"] and lang.lower() != speaker_lang[0]:
+             logging.debug(
+                 f'lang "{lang}" is not in speaker_lang {speaker_lang}, automatically set lang={speaker_lang[0]}'
+             )
+             lang = speaker_lang[0]
+
+     sentences_list = []
+     if lang.upper() != "MIX":
+         if max <= 0:
+             sentences_list.append(
+                 markup_language(text, speaker_lang)
+                 if lang.upper() == "AUTO"
+                 else f"[{lang.upper()}]{text}[{lang.upper()}]"
+             )
+         else:
+             for i in sentence_split(text, max):
+                 if check_is_none(i):
+                     continue
+                 sentences_list.append(
+                     markup_language(i, speaker_lang)
+                     if lang.upper() == "AUTO"
+                     else f"[{lang.upper()}]{i}[{lang.upper()}]"
+                 )
+     else:
+         sentences_list.append(text)
+
+     for i in sentences_list:
+         logging.debug(i)
+
+     return sentences_list
+
+
+ if __name__ == "__main__":
+     text = "这几天心里颇不宁静。今晚在院子里坐着乘凉,忽然想起日日走过的荷塘,在这满月的光里,总该另有一番样子吧。月亮渐渐地升高了,墙外马路上孩子们的欢笑,已经听不见了;妻在屋里拍着闰儿,迷迷糊糊地哼着眠歌。我悄悄地披了大衫,带上门出去。"
+     print(markup_language(text, target_languages=None))
+     print(sentence_split(text, max=50))
+     print(sentence_split_and_markup(text, max=50, lang="auto", speaker_lang=None))
+
+     text = "你好,这是一段用来测试自动标注的文本。こんにちは,これは自動ラベリングのテスト用テキストです.Hello, this is a piece of text to test autotagging.你好!今天我们要介绍VITS项目,其重点是使用了GAN Duration predictor和transformer flow,并且接入了Bert模型来提升韵律。Bert embedding会在稍后介绍。"
+     print(split_by_language(text, ["zh", "ja", "en"]))
+
+     text = "vits和Bert-VITS2是tts模型。花费3days.花费3天。Take 3 days"
+
+     print(split_by_language(text, ["zh", "ja", "en"]))
+     # output: [('vits', 'en'), ('和', 'ja'), ('Bert-VITS', 'en'), ('2是', 'zh'), ('tts', 'en'), ('模型。花费3', 'zh'), ('days.', 'en'), ('花费3天。', 'zh'), ('Take 3 days', 'en')]
+
+     print(split_by_language(text, ["zh", "en"]))
+     # output: [('vits', 'en'), ('和', 'zh'), ('Bert-VITS', 'en'), ('2是', 'zh'), ('tts', 'en'), ('模型。花费3', 'zh'), ('days.', 'en'), ('花费3天。', 'zh'), ('Take 3 days', 'en')]
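
The [ZH]/[JA]/[EN] tags emitted by markup_language follow the bracketed language-tag convention of VITS-style multilingual frontends: at each language switch a closing tag for the previous run is immediately followed by an opening tag for the next one, and the string ends with a final closing tag. An illustrative call (the exact output depends on the configured classifier, hence the "e.g."):

    from tools.sentence import markup_language

    print(markup_language("你好。Hello.", target_languages=["zh", "en"]))
    # e.g. "[ZH]你好。[ZH][EN]Hello.[EN]"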
tools/translate.py ADDED
@@ -0,0 +1,61 @@
+ """
+ Translation API (Baidu)
+ """
+ from config import config
+
+ import random
+ import hashlib
+ import requests
+
+
+ def translate(Sentence: str, to_Language: str = "jp", from_Language: str = ""):
+     """
+     :param Sentence: text to translate
+     :param from_Language: language of the source text
+     :param to_Language: target language
+     :return: translated text; the original Sentence is returned on error
+
+     Common language codes: Chinese zh, English en, Japanese jp
+     """
+     appid = config.translate_config.app_key
+     key = config.translate_config.secret_key
+     if appid == "" or key == "":
+         return "Please configure app_key and secret_key in config.yml"
+     url = "https://fanyi-api.baidu.com/api/trans/vip/translate"
+     texts = Sentence.splitlines()
+     outTexts = []
+     for t in texts:
+         if t != "":
+             # Signature calculation, see https://api.fanyi.baidu.com/product/113
+             salt = str(random.randint(1, 100000))
+             signString = appid + t + salt + key
+             hs = hashlib.md5()
+             hs.update(signString.encode("utf-8"))
+             signString = hs.hexdigest()
+             if from_Language == "":
+                 from_Language = "auto"
+             headers = {"Content-Type": "application/x-www-form-urlencoded"}
+             payload = {
+                 "q": t,
+                 "from": from_Language,
+                 "to": to_Language,
+                 "appid": appid,
+                 "salt": salt,
+                 "sign": signString,
+             }
+             # Send the request
+             try:
+                 response = requests.post(
+                     url=url, data=payload, headers=headers, timeout=3
+                 )
+                 response = response.json()
+                 if "trans_result" in response.keys():
+                     result = response["trans_result"][0]
+                     if "dst" in result.keys():
+                         dst = result["dst"]
+                         outTexts.append(dst)
+             except Exception:
+                 return Sentence
+         else:
+             outTexts.append(t)
+     return "\n".join(outTexts)
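
A usage sketch (assumes valid Baidu credentials in config.yml; on a network or API error the original string is returned unchanged, per the except branch above):

    from tools.translate import translate

    print(translate("你好,世界", to_Language="en"))  # e.g. "Hello, world"
    print(translate("Good morning", to_Language="jp", from_Language="en"))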