ElesisSiegherts committed
Commit c729232 • 1 Parent(s): a2521ef
Upload 15 files
Browse files
- tools/__init__.py +3 -0
- tools/__pycache__/__init__.cpython-310.pyc +0 -0
- tools/__pycache__/__init__.cpython-38.pyc +0 -0
- tools/__pycache__/classify_language.cpython-310.pyc +0 -0
- tools/__pycache__/classify_language.cpython-38.pyc +0 -0
- tools/__pycache__/log.cpython-310.pyc +0 -0
- tools/__pycache__/log.cpython-38.pyc +0 -0
- tools/__pycache__/sentence.cpython-310.pyc +0 -0
- tools/__pycache__/sentence.cpython-38.pyc +0 -0
- tools/__pycache__/translate.cpython-310.pyc +0 -0
- tools/__pycache__/translate.cpython-38.pyc +0 -0
- tools/classify_language.py +189 -0
- tools/log.py +16 -0
- tools/sentence.py +169 -0
- tools/translate.py +61 -0
tools/__init__.py ADDED
@@ -0,0 +1,3 @@
+"""
+Toolkit
+"""
tools/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (160 Bytes)

tools/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (158 Bytes)

tools/__pycache__/classify_language.cpython-310.pyc ADDED
Binary file (2.78 kB)

tools/__pycache__/classify_language.cpython-38.pyc ADDED
Binary file (3.17 kB)

tools/__pycache__/log.cpython-310.pyc ADDED
Binary file (399 Bytes)

tools/__pycache__/log.cpython-38.pyc ADDED
Binary file (397 Bytes)

tools/__pycache__/sentence.cpython-310.pyc ADDED
Binary file (4.65 kB)

tools/__pycache__/sentence.cpython-38.pyc ADDED
Binary file (4.63 kB)

tools/__pycache__/translate.cpython-310.pyc ADDED
Binary file (1.6 kB)

tools/__pycache__/translate.cpython-38.pyc ADDED
Binary file (1.6 kB)
tools/classify_language.py ADDED
@@ -0,0 +1,189 @@
+import regex as re
+
+try:
+    from config import config
+
+    LANGUAGE_IDENTIFICATION_LIBRARY = (
+        config.webui_config.language_identification_library
+    )
+except:
+    LANGUAGE_IDENTIFICATION_LIBRARY = "langid"
+
+module = LANGUAGE_IDENTIFICATION_LIBRARY.lower()
+
+langid_languages = [
+    "af",
+    "am",
+    "an",
+    "ar",
+    "as",
+    "az",
+    "be",
+    "bg",
+    "bn",
+    "br",
+    "bs",
+    "ca",
+    "cs",
+    "cy",
+    "da",
+    "de",
+    "dz",
+    "el",
+    "en",
+    "eo",
+    "es",
+    "et",
+    "eu",
+    "fa",
+    "fi",
+    "fo",
+    "fr",
+    "ga",
+    "gl",
+    "gu",
+    "he",
+    "hi",
+    "hr",
+    "ht",
+    "hu",
+    "hy",
+    "id",
+    "is",
+    "it",
+    "ja",
+    "jv",
+    "ka",
+    "kk",
+    "km",
+    "kn",
+    "ko",
+    "ku",
+    "ky",
+    "la",
+    "lb",
+    "lo",
+    "lt",
+    "lv",
+    "mg",
+    "mk",
+    "ml",
+    "mn",
+    "mr",
+    "ms",
+    "mt",
+    "nb",
+    "ne",
+    "nl",
+    "nn",
+    "no",
+    "oc",
+    "or",
+    "pa",
+    "pl",
+    "ps",
+    "pt",
+    "qu",
+    "ro",
+    "ru",
+    "rw",
+    "se",
+    "si",
+    "sk",
+    "sl",
+    "sq",
+    "sr",
+    "sv",
+    "sw",
+    "ta",
+    "te",
+    "th",
+    "tl",
+    "tr",
+    "ug",
+    "uk",
+    "ur",
+    "vi",
+    "vo",
+    "wa",
+    "xh",
+    "zh",
+    "zu",
+]
+
+
+def classify_language(text: str, target_languages: list = None) -> str:
+    if module == "fastlid" or module == "fasttext":
+        from fastlid import fastlid, supported_langs
+
+        classifier = fastlid
+        if target_languages != None:
+            target_languages = [
+                lang for lang in target_languages if lang in supported_langs
+            ]
+            fastlid.set_languages = target_languages
+    elif module == "langid":
+        import langid
+
+        classifier = langid.classify
+        if target_languages != None:
+            target_languages = [
+                lang for lang in target_languages if lang in langid_languages
+            ]
+            langid.set_languages(target_languages)
+    else:
+        raise ValueError(f"Wrong module {module}")
+
+    lang = classifier(text)[0]
+
+    return lang
+
+
+def classify_zh_ja(text: str) -> str:
+    for idx, char in enumerate(text):
+        unicode_val = ord(char)
+
+        # Detect Japanese kana characters
+        if 0x3040 <= unicode_val <= 0x309F or 0x30A0 <= unicode_val <= 0x30FF:
+            return "ja"
+
+        # Detect Han (Chinese) characters
+        if 0x4E00 <= unicode_val <= 0x9FFF:
+            # Check the character that follows
+            next_char = text[idx + 1] if idx + 1 < len(text) else None
+
+            if next_char and (
+                0x3040 <= ord(next_char) <= 0x309F or 0x30A0 <= ord(next_char) <= 0x30FF
+            ):
+                return "ja"
+
+    return "zh"
+
+
+def split_alpha_nonalpha(text, mode=1):
+    if mode == 1:
+        pattern = r"(?<=[\u4e00-\u9fff\u3040-\u30FF\d])(?=[\p{Latin}])|(?<=[\p{Latin}])(?=[\u4e00-\u9fff\u3040-\u30FF\d])"
+    elif mode == 2:
+        pattern = r"(?<=[\u4e00-\u9fff\u3040-\u30FF])(?=[\p{Latin}\d])|(?<=[\p{Latin}\d])(?=[\u4e00-\u9fff\u3040-\u30FF])"
+    else:
+        raise ValueError("Invalid mode. Supported modes are 1 and 2.")
+
+    return re.split(pattern, text)
+
+
+if __name__ == "__main__":
+    text = "这是一个测试文本"
+    print(classify_language(text))
+    print(classify_zh_ja(text))  # "zh"
+
+    text = "これはテストテキストです"
+    print(classify_language(text))
+    print(classify_zh_ja(text))  # "ja"
+
+    text = "vits和Bert-VITS2是tts模型。花费3days.花费3天。Take 3 days"
+
+    print(split_alpha_nonalpha(text, mode=1))
+    # output: ['vits', '和', 'Bert-VITS', '2是', 'tts', '模型。花费3', 'days.花费3天。Take 3 days']
+
+    print(split_alpha_nonalpha(text, mode=2))
+    # output: ['vits', '和', 'Bert-VITS2', '是', 'tts', '模型。花费', '3days.花费', '3', '天。Take 3 days']
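In classify_zh_ja, the kana ranges (U+3040–U+30FF) decide immediately, while a Han character only yields "ja" when the character after it is kana. A few extra hand-traced calls, not part of the commit (the sample strings are made up):

    from tools.classify_language import classify_zh_ja

    print(classify_zh_ja("我输了"))          # "zh": Han characters only, no kana anywhere
    print(classify_zh_ja("私は負けました"))  # "ja": the Han character 私 is followed by the hiragana は
    print(classify_zh_ja("カメラ"))          # "ja": katakana falls in U+30A0–U+30FF and matches directly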
tools/log.py ADDED
@@ -0,0 +1,16 @@
+"""
+Logger wrapper
+"""
+from loguru import logger
+import sys
+
+
+# Remove all default handlers
+logger.remove()
+
+# Custom format, attached to standard output
+log_format = (
+    "<g>{time:MM-DD HH:mm:ss}</g> <lvl>{level:<9}</lvl>| {file}:{line} | {message}"
+)
+
+logger.add(sys.stdout, format=log_format, backtrace=True, diagnose=True)
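Because the sink is registered at import time, downstream modules only need to import the shared logger. A minimal usage sketch (the messages are illustrative, not part of the commit):

    from tools.log import logger

    logger.info("model loaded")  # e.g. 11-05 10:30:00 INFO     | demo.py:3 | model loaded
    logger.warning("translate keys missing from config.yml")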
tools/sentence.py ADDED
@@ -0,0 +1,169 @@
+import logging
+
+import regex as re
+
+from tools.classify_language import classify_language, split_alpha_nonalpha
+
+
+def check_is_none(item) -> bool:
+    """none -> True, not none -> False"""
+    return (
+        item is None
+        or (isinstance(item, str) and str(item).isspace())
+        or str(item) == ""
+    )
+
+
+def markup_language(text: str, target_languages: list = None) -> str:
+    pattern = (
+        r"[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`"
+        r"\!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」"
+        r"『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+"
+    )
+    sentences = re.split(pattern, text)
+
+    pre_lang = ""
+    p = 0
+
+    if target_languages is not None:
+        sorted_target_languages = sorted(target_languages)
+        if sorted_target_languages in [["en", "zh"], ["en", "ja"], ["en", "ja", "zh"]]:
+            new_sentences = []
+            for sentence in sentences:
+                new_sentences.extend(split_alpha_nonalpha(sentence))
+            sentences = new_sentences
+
+    for sentence in sentences:
+        if check_is_none(sentence):
+            continue
+
+        lang = classify_language(sentence, target_languages)
+
+        if pre_lang == "":
+            text = text[:p] + text[p:].replace(
+                sentence, f"[{lang.upper()}]{sentence}", 1
+            )
+            p += len(f"[{lang.upper()}]")
+        elif pre_lang != lang:
+            text = text[:p] + text[p:].replace(
+                sentence, f"[{pre_lang.upper()}][{lang.upper()}]{sentence}", 1
+            )
+            p += len(f"[{pre_lang.upper()}][{lang.upper()}]")
+        pre_lang = lang
+        p += text[p:].index(sentence) + len(sentence)
+    text += f"[{pre_lang.upper()}]"
+
+    return text
+
+
+def split_by_language(text: str, target_languages: list = None) -> list:
+    pattern = (
+        r"[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`"
+        r"\!?\。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」"
+        r"『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+"
+    )
+    sentences = re.split(pattern, text)
+
+    pre_lang = ""
+    start = 0
+    end = 0
+    sentences_list = []
+
+    if target_languages is not None:
+        sorted_target_languages = sorted(target_languages)
+        if sorted_target_languages in [["en", "zh"], ["en", "ja"], ["en", "ja", "zh"]]:
+            new_sentences = []
+            for sentence in sentences:
+                new_sentences.extend(split_alpha_nonalpha(sentence))
+            sentences = new_sentences
+
+    for sentence in sentences:
+        if check_is_none(sentence):
+            continue
+
+        lang = classify_language(sentence, target_languages)
+
+        end += text[end:].index(sentence)
+        if pre_lang != "" and pre_lang != lang:
+            sentences_list.append((text[start:end], pre_lang))
+            start = end
+        end += len(sentence)
+        pre_lang = lang
+    sentences_list.append((text[start:], pre_lang))
+
+    return sentences_list
+
+
+def sentence_split(text: str, max: int) -> list:
+    pattern = r"[!(),—+\-.:;??。,、;:]+"
+    sentences = re.split(pattern, text)
+    discarded_chars = re.findall(pattern, text)
+
+    sentences_list, count, p = [], 0, 0
+
+    # Walk through the separators the text was split on
+    for i, discarded_char in enumerate(discarded_chars):
+        count += len(sentences[i]) + len(discarded_char)
+        if count >= max:
+            sentences_list.append(text[p : p + count].strip())
+            p += count
+            count = 0
+
+    # Append whatever text remains
+    if p < len(text):
+        sentences_list.append(text[p:])
+
+    return sentences_list
+
+
+def sentence_split_and_markup(text, max=50, lang="auto", speaker_lang=None):
+    # If this speaker only supports a single language
+    if speaker_lang is not None and len(speaker_lang) == 1:
+        if lang.upper() not in ["AUTO", "MIX"] and lang.lower() != speaker_lang[0]:
+            logging.debug(
+                f'lang "{lang}" is not in speaker_lang {speaker_lang}, automatically set lang={speaker_lang[0]}'
+            )
+            lang = speaker_lang[0]
+
+    sentences_list = []
+    if lang.upper() != "MIX":
+        if max <= 0:
+            sentences_list.append(
+                markup_language(text, speaker_lang)
+                if lang.upper() == "AUTO"
+                else f"[{lang.upper()}]{text}[{lang.upper()}]"
+            )
+        else:
+            for i in sentence_split(text, max):
+                if check_is_none(i):
+                    continue
+                sentences_list.append(
+                    markup_language(i, speaker_lang)
+                    if lang.upper() == "AUTO"
+                    else f"[{lang.upper()}]{i}[{lang.upper()}]"
+                )
+    else:
+        sentences_list.append(text)
+
+    for i in sentences_list:
+        logging.debug(i)
+
+    return sentences_list
+
+
+if __name__ == "__main__":
+    text = "这几天心里颇不宁静。今晚在院子里坐着乘凉,忽然想起日日走过的荷塘,在这满月的光里,总该另有一番样子吧。月亮渐渐地升高了,墙外马路上孩子们的欢笑,已经听不见了;妻在屋里拍着闰儿,迷迷糊糊地哼着眠歌。我悄悄地披了大衫,带上门出去。"
+    print(markup_language(text, target_languages=None))
+    print(sentence_split(text, max=50))
+    print(sentence_split_and_markup(text, max=50, lang="auto", speaker_lang=None))
+
+    text = "你好,这是一段用来测试自动标注的文本。こんにちは,これは自動ラベリングのテスト用テキストです.Hello, this is a piece of text to test autotagging.你好!今天我们要介绍VITS项目,其重点是使用了GAN Duration predictor和transformer flow,并且接入了Bert模型来提升韵律。Bert embedding会在稍后介绍。"
+    print(split_by_language(text, ["zh", "ja", "en"]))
+
+    text = "vits和Bert-VITS2是tts模型。花费3days.花费3天。Take 3 days"
+
+    print(split_by_language(text, ["zh", "ja", "en"]))
+    # output: [('vits', 'en'), ('和', 'ja'), ('Bert-VITS', 'en'), ('2是', 'zh'), ('tts', 'en'), ('模型。花费3', 'zh'), ('days.', 'en'), ('花费3天。', 'zh'), ('Take 3 days', 'en')]
+
+    print(split_by_language(text, ["zh", "en"]))
+    # output: [('vits', 'en'), ('和', 'zh'), ('Bert-VITS', 'en'), ('2是', 'zh'), ('tts', 'en'), ('模型。花费3', 'zh'), ('days.', 'en'), ('花费3天。', 'zh'), ('Take 3 days', 'en')]
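split_by_language returns plain (segment, language) tuples, so a caller can route each monolingual span separately. A hypothetical consumer, not part of the commit (the sample text is made up):

    from tools.sentence import split_by_language

    text = "你好,this is a mixed sentence"
    for segment, lang in split_by_language(text, target_languages=["zh", "en"]):
        # hand each span to the TTS front end for its language
        print(lang, segment)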
tools/translate.py ADDED
@@ -0,0 +1,61 @@
+"""
+Translation API
+"""
+from config import config
+
+import random
+import hashlib
+import requests
+
+
+def translate(Sentence: str, to_Language: str = "jp", from_Language: str = ""):
+    """
+    :param Sentence: the sentence to translate
+    :param from_Language: language of the sentence to translate
+    :param to_Language: target language
+    :return: the translated sentence (the original sentence is returned on error)
+
+    Common language codes: Chinese zh, English en, Japanese jp
+    """
+    appid = config.translate_config.app_key
+    key = config.translate_config.secret_key
+    if appid == "" or key == "":
+        return "请开发者在config.yml中配置app_key与secret_key"
+    url = "https://fanyi-api.baidu.com/api/trans/vip/translate"
+    texts = Sentence.splitlines()
+    outTexts = []
+    for t in texts:
+        if t != "":
+            # Signature calculation, see the docs: https://api.fanyi.baidu.com/product/113
+            salt = str(random.randint(1, 100000))
+            signString = appid + t + salt + key
+            hs = hashlib.md5()
+            hs.update(signString.encode("utf-8"))
+            signString = hs.hexdigest()
+            if from_Language == "":
+                from_Language = "auto"
+            headers = {"Content-Type": "application/x-www-form-urlencoded"}
+            payload = {
+                "q": t,
+                "from": from_Language,
+                "to": to_Language,
+                "appid": appid,
+                "salt": salt,
+                "sign": signString,
+            }
+            # Send the request
+            try:
+                response = requests.post(
+                    url=url, data=payload, headers=headers, timeout=3
+                )
+                response = response.json()
+                if "trans_result" in response.keys():
+                    result = response["trans_result"][0]
+                    if "dst" in result.keys():
+                        dst = result["dst"]
+                        outTexts.append(dst)
+            except Exception:
+                return Sentence
+        else:
+            outTexts.append(t)
+    return "\n".join(outTexts)
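A minimal usage sketch, assuming app_key and secret_key are set under translate_config in config.yml (the sentences are illustrative, not part of the commit):

    from tools.translate import translate

    # Baidu's API expects "jp" for Japanese (not ISO "ja"), per the docstring above
    print(translate("今天天气不错", to_Language="jp"))
    print(translate("Hello there", to_Language="zh", from_Language="en"))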