Mahiruoshi committed
Commit: 1c576d6
Parent(s): 9b16e15

Update app.py

app.py CHANGED
@@ -16,6 +16,9 @@ import gradio as gr
 import time
 import datetime
 import os
+import pickle
+import openai
+from scipy.io.wavfile import write
 def is_japanese(string):
     for ch in string:
         if ord(ch) > 0x3040 and ord(ch) < 0x30FF:
@@ -58,6 +61,67 @@ def to_numpy(tensor: torch.Tensor):
     return tensor.detach().cpu().numpy() if tensor.requires_grad \
         else tensor.detach().numpy()
 
+def to_html(chat_history):
+    chat_html = ""
+    for item in chat_history:
+        if item['role'] == 'user':
+            chat_html += f"""
+            <div style="margin-bottom: 20px;">
+                <div style="text-align: right; margin-right: 20px;">
+                    <span style="background-color: #4CAF50; color: black; padding: 10px; border-radius: 10px; display: inline-block; max-width: 80%; word-wrap: break-word;">
+                        {item['content']}
+                    </span>
+                </div>
+            </div>
+            """
+        else:
+            chat_html += f"""
+            <div style="margin-bottom: 20px;">
+                <div style="text-align: left; margin-left: 20px;">
+                    <span style="background-color: white; color: black; padding: 10px; border-radius: 10px; display: inline-block; max-width: 80%; word-wrap: break-word;">
+                        {item['content']}
+                    </span>
+                </div>
+            </div>
+            """
+    output_html = f"""
+    <div style="height: 400px; overflow-y: scroll; padding: 10px;">
+        {chat_html}
+    </div>
+    """
+    return output_html
+
+
+
+def chatgpt(text):
+    messages = []
+    try:
+        with open('log.pickle', 'rb') as f:
+            messages = pickle.load(f)
+        messages.append({"role": "user", "content": text},)
+        chat = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
+        reply = chat.choices[0].message.content
+        messages.append({"role": "assistant", "content": reply})
+        print(messages[-1])
+        if len(messages) == 12:
+            messages[6:10] = messages[8:]
+            del messages[-2:]
+        with open('log.pickle', 'wb') as f:
+            pickle.dump(messages, f)
+        return reply,messages
+    except:
+        messages.append({"role": "user", "content": text},)
+        chat = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
+        reply = chat.choices[0].message.content
+        messages.append({"role": "assistant", "content": reply})
+        print(messages[-1])
+        if len(messages) == 12:
+            messages[6:10] = messages[8:]
+            del messages[-2:]
+        with open('log.pickle', 'wb') as f:
+            pickle.dump(messages, f)
+        return reply,messages
+
 def get_symbols_from_json(path):
     assert os.path.isfile(path)
     with open(path, 'r') as f:
@@ -90,7 +154,14 @@ def get_text(text,hps_ms):
 
 def create_tts_fn(net_g,hps,speaker_id):
     speaker_id = int(speaker_id)
-    def tts_fn(text, language, extract, n_scale= 0.667,n_scale_w = 0.8, l_scale = 1 ):
+    def tts_fn(is_gpt,api_key,is_audio,audiopath,repeat_time,text, language, extract, n_scale= 0.667,n_scale_w = 0.8, l_scale = 1 ):
+        repeat_time = int(repeat_time)
+        if is_gpt:
+            openai.api_key = api_key
+            text,messages = chatgpt(text)
+            htm = to_html(messages)
+        else:
+            htm = ""
         if not extract:
             print(text)
             t1 = time.time()
@@ -104,7 +175,12 @@ def create_tts_fn(net_g,hps,speaker_id):
             spending_time = "推理时间为:"+str(t2-t1)+"s"
             print(spending_time)
             file_path = "subtitles.srt"
-
+            write(audiopath + '.wav',22050,audio)
+            if is_audio:
+                for i in range(repeat_time):
+                    cmd = 'ffmpeg -y -i ' + audiopath + '.wav' + ' -ar 44100 '+ audiopath.replace('temp','temp'+str(i))
+                    os.system(cmd)
+            return (hps.data.sampling_rate, audio),file_path,htm
         else:
             a = ['【','[','(','(']
             b = ['】',']',')',')']
@@ -119,7 +195,7 @@ def create_tts_fn(net_g,hps,speaker_id):
             f1 = open("subtitles.srt",'w',encoding='utf-8')
             for sentence in final_list:
                 c +=1
-                stn_tst = get_text(sle(language,
+                stn_tst = get_text(sle(language,sentence),hps)
                 with torch.no_grad():
                     x_tst = stn_tst.unsqueeze(0).to(dev)
                     x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
@@ -136,8 +212,13 @@ def create_tts_fn(net_g,hps,speaker_id):
                     print(time_end)
                     f1.write(str(c-1)+'\n'+time_start+' --> '+time_end+'\n'+sentence+'\n\n')
                     audio_fin.append(audio)
+            write(audiopath + '.wav',22050,np.concatenate(audio_fin))
+            if is_audio:
+                for i in range(repeat_time):
+                    cmd = 'ffmpeg -y -i ' + audiopath + '.wav' + ' -ar 44100 '+ audiopath.replace('temp','temp'+str(i))
+                    os.system(cmd)
             file_path = "subtitles.srt"
-            return (hps.data.sampling_rate, np.concatenate(audio_fin)),file_path
+            return (hps.data.sampling_rate, np.concatenate(audio_fin)),file_path,htm
         return tts_fn
 
 
@@ -174,36 +255,38 @@ if __name__ == '__main__':
         models.append(content)
 
     with gr.Blocks() as app:
-        gr.Markdown(
-            "## <center> LoveLive-ShojoKageki 中日双语Vits\n"
-            "### <center> 请不要生成会对个人以及企划造成侵害的内容\n"
-            '<div align="center"><a>参数说明:由于模型仍然在训练初期阶段,少歌部分质量较差,建议将噪声比例调节至0.2-0.3区间,ShojoKageki模型则可以尝试默认的0.667;噪声偏差对应着每个字之间的间隔,对普通话影响较大,建议0.6-0.8;duration代表整体语速</div>'
-            '<div align="center"><a>合成前请先选择模型,否则第一次合成不一定成功。长段落/小说合成建议colab或本地运行</div>')
         with gr.Tabs():
            for i in schools:
                with gr.TabItem(i):
                    for (sid, name, title, example, tts_fn) in models[schools.index(i)]:
                        with gr.TabItem(name):
-                            with gr.
-                                with gr.
+                            with gr.Column():
+                                with gr.Accordion(label="Setting", open=False):
+                                    input2 = gr.Dropdown(label="Language", choices=lan, value="自动", interactive=True)
+                                    input3 = gr.Checkbox(value=True, label="长句切割(小说合成)")
+                                    input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.267)
+                                    input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.7)
+                                    input6 = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1)
+                                with gr.Accordion(label="Advanced Setting", open=False):
+                                    audio_input3 = gr.Dropdown(label="重复次数", choices=list(range(101)), value='0', interactive=True)
+                                    api_input1 = gr.Checkbox(value=True, label="接入chatgpt")
+                                    api_input2 = gr.TextArea(label="api-key",lines=1,value = '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')
+                                    output2 = gr.outputs.File(label="字幕文件:subtitles.srt")
+                                    audio_input1 = gr.Checkbox(value=True, label="修改音频路径(live2d)")
+                                    audio_input2 = gr.TextArea(label="音频路径",lines=1,value = 'D:/app_develop/live2d_whole/2010002/sounds/temp.wav')
+
+                                with gr.Row():
                                     with gr.Row():
-
-
-
-
-
-
-
-
-
-
-
-                                        input1 = gr.TextArea(label="Text", value=example)
-                                        input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.267)
-                                        input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.7)
-                                        input6 = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1)
-                                        btnVC = gr.Button("Submit")
-                                        output1 = gr.Audio(label="采样率22050")
-                                        btnVC.click(tts_fn, inputs=[input1, input2,input3,input4, input5, input6], outputs=[output1,output2])
+                                        gr.Markdown(
+                                            '<div align="center">'
+                                            f'<img style="width:auto;height:400px;" src="file/image/{name}.png">'
+                                            '</div>'
+                                        )
+                                    with gr.Row():
+                                        output_UI = gr.outputs.HTML()
+                                        input1 = gr.TextArea(label="Text", value=example, lines=1)
+                                        btnVC = gr.Button("Submit")
+                                        output1 = gr.Audio(label="采样率22050")
+                                        btnVC.click(tts_fn, inputs=[api_input1,api_input2,audio_input1,audio_input2,audio_input3,input1,input2,input3,input4,input5,input6], outputs=[output1,output2,output_UI])
 
-    app.launch()
+    app.launch()
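
The to_html() helper added above renders the chat history as stacked bubbles: user turns right-aligned on a green background, assistant turns left-aligned on white, all wrapped in a scrollable 400px container. A minimal usage sketch with toy data (illustrative only, not part of the commit):

    # Toy history; the app itself passes the list returned by chatgpt().
    history = [
        {"role": "user", "content": "こんにちは"},
        {"role": "assistant", "content": "こんにちは、何を合成しますか?"},
    ]
    html = to_html(history)   # one scrollable <div> containing both bubbles
    print(html[:120])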
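
In chatgpt(), the pickle-backed log is capped once it reaches 12 entries: the slice assignment plus the deletion drops the two messages at positions 6 and 7 and keeps everything else, so the earlier entries (presumably a fixed prompt seeded into log.pickle) survive while the oldest later exchange is discarded. A small sketch of just that trimming rule (stand-in strings, not the app's data):

    # Stand-in history of 12 entries (indices 0-11).
    history = [f"msg{i}" for i in range(12)]
    if len(history) == 12:
        history[6:10] = history[8:]   # copy the last four entries forward
        del history[-2:]              # drop the now-duplicated tail
    print(history)  # msg0..msg5, msg8..msg11 -> entries 6 and 7 removed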
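
The new export path writes a 22050 Hz WAV with scipy.io.wavfile.write and then calls ffmpeg through os.system to emit 44100 Hz copies whose names substitute 'temp' with 'temp0', 'temp1', and so on. Below is a sketch of the same resampling step using subprocess.run with an argument list; this is an alternative shown for clarity, not the committed code, and it assumes ffmpeg is on PATH:

    import subprocess

    def resample_copies(src_wav: str, repeat_time: int) -> None:
        # Re-encode src_wav at 44100 Hz, writing one output file per iteration.
        for i in range(repeat_time):
            dst = src_wav.replace("temp", "temp" + str(i))
            # -y: overwrite, -i: input file, -ar: target sample rate
            subprocess.run(["ffmpeg", "-y", "-i", src_wav, "-ar", "44100", dst],
                           check=True)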
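
The rewired btnVC.click() passes its eleven components to tts_fn positionally, so the inputs list must line up with the new signature: (api_input1, api_input2, audio_input1, audio_input2, audio_input3, input1, input2, input3, input4, input5, input6) maps onto (is_gpt, api_key, is_audio, audiopath, repeat_time, text, language, extract, n_scale, n_scale_w, l_scale). A toy sketch of that wiring pattern with hypothetical components, not the app's UI:

    import gradio as gr

    def echo(use_upper, text):
        # Parameter order matches the inputs list in btn.click below.
        return text.upper() if use_upper else text

    with gr.Blocks() as demo:
        use_upper = gr.Checkbox(label="Uppercase", value=False)
        text_in = gr.Textbox(label="Text")
        text_out = gr.Textbox(label="Result")
        btn = gr.Button("Submit")
        btn.click(echo, inputs=[use_upper, text_in], outputs=[text_out])

    # demo.launch()  # uncomment to serve locally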