Mahiruoshi committed
Commit 1c576d6
Parent: 9b16e15

Update app.py

Files changed (1)
app.py: +113 -30
app.py CHANGED
@@ -16,6 +16,9 @@ import gradio as gr
 import time
 import datetime
 import os
+import pickle
+import openai
+from scipy.io.wavfile import write
 def is_japanese(string):
     for ch in string:
         if ord(ch) > 0x3040 and ord(ch) < 0x30FF:
@@ -58,6 +61,67 @@ def to_numpy(tensor: torch.Tensor):
     return tensor.detach().cpu().numpy() if tensor.requires_grad \
         else tensor.detach().numpy()
 
+def to_html(chat_history):
+    chat_html = ""
+    for item in chat_history:
+        if item['role'] == 'user':
+            chat_html += f"""
+            <div style="margin-bottom: 20px;">
+                <div style="text-align: right; margin-right: 20px;">
+                    <span style="background-color: #4CAF50; color: black; padding: 10px; border-radius: 10px; display: inline-block; max-width: 80%; word-wrap: break-word;">
+                        {item['content']}
+                    </span>
+                </div>
+            </div>
+            """
+        else:
+            chat_html += f"""
+            <div style="margin-bottom: 20px;">
+                <div style="text-align: left; margin-left: 20px;">
+                    <span style="background-color: white; color: black; padding: 10px; border-radius: 10px; display: inline-block; max-width: 80%; word-wrap: break-word;">
+                        {item['content']}
+                    </span>
+                </div>
+            </div>
+            """
+    output_html = f"""
+    <div style="height: 400px; overflow-y: scroll; padding: 10px;">
+        {chat_html}
+    </div>
+    """
+    return output_html
+
+
+
+def chatgpt(text):
+    messages = []
+    try:
+        with open('log.pickle', 'rb') as f:
+            messages = pickle.load(f)
+        messages.append({"role": "user", "content": text},)
+        chat = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
+        reply = chat.choices[0].message.content
+        messages.append({"role": "assistant", "content": reply})
+        print(messages[-1])
+        if len(messages) == 12:
+            messages[6:10] = messages[8:]
+            del messages[-2:]
+        with open('log.pickle', 'wb') as f:
+            pickle.dump(messages, f)
+        return reply,messages
+    except:
+        messages.append({"role": "user", "content": text},)
+        chat = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
+        reply = chat.choices[0].message.content
+        messages.append({"role": "assistant", "content": reply})
+        print(messages[-1])
+        if len(messages) == 12:
+            messages[6:10] = messages[8:]
+            del messages[-2:]
+        with open('log.pickle', 'wb') as f:
+            pickle.dump(messages, f)
+        return reply,messages
+
 def get_symbols_from_json(path):
     assert os.path.isfile(path)
     with open(path, 'r') as f:
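The new chatgpt() helper keeps the running conversation in log.pickle (hence the pickle import above) and calls the pre-1.0 openai package's openai.ChatCompletion.create interface. Its history cap is the least obvious part: once the list reaches 12 entries, the slice assignment copies the latest four messages down and the del drops the duplicated tail, so the oldest user/assistant pair after the first six entries disappears. A minimal sketch of that trimming rule on placeholder data, separate from the diff itself:

# Sketch only: the history-trimming rule from chatgpt(), run on dummy messages.
messages = [{"role": "user", "content": f"turn {i}"} for i in range(12)]
if len(messages) == 12:
    messages[6:10] = messages[8:]   # copy entries 8-11 into slots 6-9
    del messages[-2:]               # drop the now-duplicated last two entries
print(len(messages))                # 10: indices 0-5 survive, plus the latest four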
@@ -90,7 +154,14 @@ def get_text(text,hps_ms):
 
 def create_tts_fn(net_g,hps,speaker_id):
     speaker_id = int(speaker_id)
-    def tts_fn(text, language, extract, n_scale= 0.667,n_scale_w = 0.8, l_scale = 1 ):
+    def tts_fn(is_gpt,api_key,is_audio,audiopath,repeat_time,text, language, extract, n_scale= 0.667,n_scale_w = 0.8, l_scale = 1 ):
+        repeat_time = int(repeat_time)
+        if is_gpt:
+            openai.api_key = api_key
+            text,messages = chatgpt(text)
+            htm = to_html(messages)
+        else:
+            htm = ""
         if not extract:
             print(text)
             t1 = time.time()
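tts_fn now takes the ChatGPT and audio-export switches ahead of the original text/language/extract arguments, in the same order as the inputs list wired to btnVC.click further down, and it returns a third value (the chat HTML) alongside the audio and subtitle path. A hypothetical direct call, assuming net_g and hps are already loaded by the unchanged model-setup code and using placeholder values:

# Sketch: the positional order mirrors inputs=[api_input1, api_input2, audio_input1,
# audio_input2, audio_input3, input1, input2, input3, input4, input5, input6].
tts_fn = create_tts_fn(net_g, hps, 0)          # assumes a loaded net_g/hps pair
(sr, wav), srt_path, chat_html = tts_fn(
    False,          # is_gpt: skip ChatGPT, synthesize the text as given
    "",             # api_key: only read when is_gpt is True
    False,          # is_audio: skip the ffmpeg re-encoding loop
    "temp",         # audiopath: base name for the exported .wav (placeholder)
    0,              # repeat_time
    "こんにちは",     # text
    "自动",          # language ("自动" = auto-detect)
    False,          # extract: treat the input as a single utterance
)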
@@ -104,7 +175,12 @@ def create_tts_fn(net_g,hps,speaker_id):
             spending_time = "推理时间为:"+str(t2-t1)+"s"
             print(spending_time)
             file_path = "subtitles.srt"
-            return (hps.data.sampling_rate, audio),file_path
+            write(audiopath + '.wav',22050,audio)
+            if is_audio:
+                for i in range(repeat_time):
+                    cmd = 'ffmpeg -y -i ' + audiopath + '.wav' + ' -ar 44100 '+ audiopath.replace('temp','temp'+str(i))
+                    os.system(cmd)
+            return (hps.data.sampling_rate, audio),file_path,htm
         else:
             a = ['【','[','(','(']
             b = ['】',']',')',')']
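When is_audio is enabled, each of the repeat_time iterations shells out to ffmpeg to produce a 44.1 kHz copy for the live2d client; the input file is whatever write() just saved (note that the default 音频路径 further down already ends in .wav, so appending '.wav' yields a doubled extension). Expanding cmd for the first iteration with that default path:

# Sketch: what cmd evaluates to for i = 0 with the default audio path from the UI below.
audiopath = 'D:/app_develop/live2d_whole/2010002/sounds/temp.wav'
i = 0
cmd = 'ffmpeg -y -i ' + audiopath + '.wav' + ' -ar 44100 ' + audiopath.replace('temp', 'temp' + str(i))
print(cmd)
# ffmpeg -y -i D:/app_develop/live2d_whole/2010002/sounds/temp.wav.wav -ar 44100 D:/app_develop/live2d_whole/2010002/sounds/temp0.wav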
@@ -119,7 +195,7 @@ def create_tts_fn(net_g,hps,speaker_id):
             f1 = open("subtitles.srt",'w',encoding='utf-8')
             for sentence in final_list:
                 c +=1
-                stn_tst = get_text(sle(language,text),hps)
+                stn_tst = get_text(sle(language,sentence),hps)
                 with torch.no_grad():
                     x_tst = stn_tst.unsqueeze(0).to(dev)
                     x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
@@ -136,8 +212,13 @@ def create_tts_fn(net_g,hps,speaker_id):
                     print(time_end)
                     f1.write(str(c-1)+'\n'+time_start+' --> '+time_end+'\n'+sentence+'\n\n')
                     audio_fin.append(audio)
+            write(audiopath + '.wav',22050,np.concatenate(audio_fin))
+            if is_audio:
+                for i in range(repeat_time):
+                    cmd = 'ffmpeg -y -i ' + audiopath + '.wav' + ' -ar 44100 '+ audiopath.replace('temp','temp'+str(i))
+                    os.system(cmd)
             file_path = "subtitles.srt"
-            return (hps.data.sampling_rate, np.concatenate(audio_fin)),file_path
+            return (hps.data.sampling_rate, np.concatenate(audio_fin)),file_path,htm
     return tts_fn
 
 
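In the long-text branch, each pass through the sentence loop also appends one cue to subtitles.srt via the f1.write call above: the cue index, the time range, and the sentence on successive lines, followed by a blank line. A single cue therefore looks roughly like this (values are illustrative):

0
00:00:00,000 --> 00:00:03,250
何かご用でしょうか?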
 
@@ -174,36 +255,38 @@ if __name__ == '__main__':
         models.append(content)
 
     with gr.Blocks() as app:
-        gr.Markdown(
-            "## <center> LoveLive-ShojoKageki 中日双语Vits\n"
-            "### <center> 请不要生成会对个人以及企划造成侵害的内容\n"
-            '<div align="center"><a>参数说明:由于模型仍然在训练初期阶段,少歌部分质量较差,建议将噪声比例调节至0.2-0.3区间,ShojoKageki模型则可以尝试默认的0.667;噪声偏差对应着每个字之间的间隔,对普通话影响较大,建议0.6-0.8;duration代表整体语速</div>'
-            '<div align="center"><a>合成前请先选择模型,否则第一次合成不一定成功。长段落/小说合成建议colab或本地运行</div>')
         with gr.Tabs():
             for i in schools:
                 with gr.TabItem(i):
                     for (sid, name, title, example, tts_fn) in models[schools.index(i)]:
                         with gr.TabItem(name):
-                            with gr.Row():
-                                with gr.Column():
+                            with gr.Column():
+                                with gr.Accordion(label="Setting", open=False):
+                                    input2 = gr.Dropdown(label="Language", choices=lan, value="自动", interactive=True)
+                                    input3 = gr.Checkbox(value=True, label="长句切割(小说合成)")
+                                    input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.267)
+                                    input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.7)
+                                    input6 = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1)
+                                with gr.Accordion(label="Advanced Setting", open=False):
+                                    audio_input3 = gr.Dropdown(label="重复次数", choices=list(range(101)), value='0', interactive=True)
+                                    api_input1 = gr.Checkbox(value=True, label="接入chatgpt")
+                                    api_input2 = gr.TextArea(label="api-key",lines=1,value = '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')
+                                    output2 = gr.outputs.File(label="字幕文件:subtitles.srt")
+                                    audio_input1 = gr.Checkbox(value=True, label="修改音频路径(live2d)")
+                                    audio_input2 = gr.TextArea(label="音频路径",lines=1,value = 'D:/app_develop/live2d_whole/2010002/sounds/temp.wav')
+
+                                with gr.Row():
                                     with gr.Row():
-                                        with gr.Column():
-                                            gr.Markdown(
-                                                '<div align="center">'
-                                                f'<img style="width:auto;height:400px;" src="file/image/{name}.png">'
-                                                '</div>'
-                                            )
-                                        with gr.Accordion(label="Advanced Options", open=False):
-                                            input2 = gr.Dropdown(label="Language", choices=lan, value="自动", interactive=True)
-                                            input3 = gr.Checkbox(value=False, label="长句切割(小说合成)")
-                                            output2 = gr.outputs.File(label="字幕文件:subtitles.srt")
-                                        with gr.Column():
-                                            input1 = gr.TextArea(label="Text", value=example)
-                                            input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.267)
-                                            input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.7)
-                                            input6 = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1)
-                                            btnVC = gr.Button("Submit")
-                                            output1 = gr.Audio(label="采样率22050")
-                                            btnVC.click(tts_fn, inputs=[input1, input2,input3,input4, input5, input6], outputs=[output1,output2])
+                                        gr.Markdown(
+                                            '<div align="center">'
+                                            f'<img style="width:auto;height:400px;" src="file/image/{name}.png">'
+                                            '</div>'
+                                        )
+                                    with gr.Row():
+                                        output_UI = gr.outputs.HTML()
+                                        input1 = gr.TextArea(label="Text", value=example, lines=1)
+                                        btnVC = gr.Button("Submit")
+                                        output1 = gr.Audio(label="采样率22050")
+                                        btnVC.click(tts_fn, inputs=[api_input1,api_input2,audio_input1,audio_input2,audio_input3,input1,input2,input3,input4,input5,input6], outputs=[output1,output2,output_UI])
 
-    app.launch()
+    app.launch()
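On the output side, output_UI is a raw HTML component: when 接入chatgpt is ticked, tts_fn feeds it the string produced by to_html(), which renders the running history as right-aligned user bubbles and left-aligned assistant bubbles inside a scrollable 400 px container. A minimal sketch of that path with dummy turns:

# Sketch: the kind of value btnVC.click routes into output_UI (contents are placeholders).
history = [
    {"role": "user", "content": "今日の天気は?"},
    {"role": "assistant", "content": "晴れです。"},
]
html = to_html(history)   # assumes to_html from app.py above; returns the scrollable chat block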
 