Mahiruoshi committed on
Commit d1de440
1 parent: de3b6be

Update app.py

Files changed (1)
  1. app.py +184 -1
app.py CHANGED
@@ -18,6 +18,188 @@ from text.symbols import symbols
  from text import text_to_sequence
  import unicodedata
  from scipy.io.wavfile import write
+ import openai
+
+ def get_text(text, hps):
+     text_norm = text_to_sequence(text, hps.data.text_cleaners)
+     if hps.data.add_blank:
+         text_norm = commons.intersperse(text_norm, 0)
+     text_norm = torch.LongTensor(text_norm)
+     return text_norm
+
+ def get_label(text, label):
+     if f'[{label}]' in text:
+         return True, text.replace(f'[{label}]', '')
+     else:
+         return False, text
+
+ def selection(speaker):
+     if speaker == "高咲侑(误)":
+         spk = 0
+         return spk
+
+     elif speaker == "歩夢":
+         spk = 1
+         return spk
+
+     elif speaker == "かすみ":
+         spk = 2
+         return spk
+
+     elif speaker == "しずく":
+         spk = 3
+         return spk
+
+     elif speaker == "果林":
+         spk = 4
+         return spk
+
+     elif speaker == "愛":
+         spk = 5
+         return spk
+
+     elif speaker == "彼方":
+         spk = 6
+         return spk
+
+     elif speaker == "せつ菜":
+         spk = 7
+         return spk
+     elif speaker == "エマ":
+         spk = 8
+         return spk
+     elif speaker == "璃奈":
+         spk = 9
+         return spk
+     elif speaker == "栞子":
+         spk = 10
+         return spk
+     elif speaker == "ランジュ":
+         spk = 11
+         return spk
+     elif speaker == "ミア":
+         spk = 12
+         return spk
+     elif speaker == "三色绘恋1":
+         spk = 13
+         return spk
+     elif speaker == "三色绘恋2":
+         spk = 15
+         return spk
+     elif speaker == "派蒙":
+         spk = 16
+         return spk
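
Note: selection() above is a hand-written lookup table; speaker id 14 is skipped between "三色绘恋1" (13) and "三色绘恋2" (15), and a name outside the list falls through and returns None. For comparison only, a dict-based equivalent (a hypothetical sketch, not part of this commit) would be:

    # Hypothetical rewrite: map display names to speaker ids.
    # Id 14 is deliberately absent, mirroring the if/elif chain above.
    SPEAKER_IDS = {
        "高咲侑(误)": 0, "歩夢": 1, "かすみ": 2, "しずく": 3, "果林": 4,
        "愛": 5, "彼方": 6, "せつ菜": 7, "エマ": 8, "璃奈": 9, "栞子": 10,
        "ランジュ": 11, "ミア": 12, "三色绘恋1": 13, "三色绘恋2": 15, "派蒙": 16,
    }

    def selection(speaker):
        return SPEAKER_IDS[speaker]
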
+ def friend_chat(text,key,call_name,tts_input3):
+     call_name = call_name
+     openai.api_key = key
+     identity = tts_input3
+     start_sequence = '\n'+str(call_name)+':'
+     restart_sequence = "\nYou: "
+     all_text = identity + restart_sequence
+     if 1 == 1:
+         prompt0 = text  # current prompt
+         if text == 'quit':
+             return prompt0
+         prompt = identity + prompt0 + start_sequence
+
+     response = openai.Completion.create(
+         model="text-davinci-003",
+         prompt=prompt,
+         temperature=0.5,
+         max_tokens=1000,
+         top_p=1.0,
+         frequency_penalty=0.5,
+         presence_penalty=0.0,
+         stop=["\nYou:"]
+     )
+     return response['choices'][0]['text'].strip()
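
Note: friend_chat() drives a completion-style dialogue: the persona text (tts_input3), the user's message, and the character's name tag are concatenated into one prompt, and the "\nYou:" stop sequence ends generation before the model would write the user's next turn (all_text is assigned but never used). With illustrative values, not from the commit, the assembled prompt looks like:

    # identity = "恶魔系学妹。", prompt0 = "你好", call_name = "かすみ"
    prompt = identity + prompt0 + start_sequence
    # prompt == "恶魔系学妹。你好\nかすみ:"
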
+ def is_japanese(string):
+     for ch in string:
+         if ord(ch) > 0x3040 and ord(ch) < 0x30FF:
+             return True
+     return False
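
Note: is_japanese() flags any character whose code point lies strictly between U+3040 and U+30FF, i.e. the hiragana and katakana blocks. Kanji (CJK ideographs, U+4E00 and up) fall outside that range, so a reply written only in kanji would be tagged [ZH] rather than [JA] by sle() below.
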
+ def sle(language,text,tts_input2,call_name,tts_input3):
+     if language == "中文":
+         tts_input1 = "[ZH]" + text.replace('\n','。').replace(' ',',') + "[ZH]"
+         return tts_input1
+     if language == "对话":
+         text = friend_chat(text,tts_input2,call_name,tts_input3).replace('\n','。').replace(' ',',')
+         text = f"[JA]{text}[JA]" if is_japanese(text) else f"[ZH]{text}[ZH]"
+         return text
+     elif language == "日文":
+         tts_input1 = "[JA]" + text.replace('\n','。').replace(' ',',') + "[JA]"
+         return tts_input1
+ def infer(language,text,tts_input2,tts_input3,speaker_id,n_scale= 0.667,n_scale_w = 0.8, l_scale = 1 ):
+     speaker_name = speaker_id
+     speaker_id = int(selection(speaker_id))
+     stn_tst = get_text(sle(language,text,tts_input2,speaker_name,tts_input3), hps_ms)
+     with torch.no_grad():
+         x_tst = stn_tst.unsqueeze(0).to(dev)
+         x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
+         sid = torch.LongTensor([speaker_id]).to(dev)
+         t1 = time.time()
+         audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
+         t2 = time.time()
+         spending_time = "推理时间:"+str(t2-t1)+"s"
+         print(spending_time)
+     return (hps_ms.data.sampling_rate, audio)
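
Note: infer() returns the (sampling_rate, audio_array) tuple that gr.Audio consumes directly. The write imported from scipy.io.wavfile at the top of the file accepts the same pair; a minimal sketch of offline use, assuming the model and config from this commit are already loaded (hypothetical, not part of the commit):

    # Synthesize one Chinese line as かすみ and save it to disk.
    sr, audio = infer("中文", "你好", "", "恶魔系学妹。", "かすみ")
    write("sample.wav", sr, audio)
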
+ lan = ["中文","日文","对话"]
+ idols = ["高咲侑(误)","歩夢","かすみ","しずく","果林","愛","彼方","せつ菜","璃奈","栞子","エマ","ランジュ","ミア","三色绘恋1","三色绘恋2","派蒙"]
+
+
+ dev = torch.device("cpu")
+ hps_ms = utils.get_hparams_from_file("config.json")
+ net_g_ms = SynthesizerTrn(
+     len(symbols),
+     hps_ms.data.filter_length // 2 + 1,
+     hps_ms.train.segment_size // hps_ms.data.hop_length,
+     n_speakers=hps_ms.data.n_speakers,
+     **hps_ms.model).to(dev)
+ _ = net_g_ms.eval()
+
+ _ = utils.load_checkpoint("G_1049000.pth", net_g_ms, None)
+
+ app = gr.Blocks()
+
+ with app:
+     with gr.Tabs():
+
+         with gr.TabItem("Basic"):
+
+             tts_input1 = gr.TextArea(label="输入你的文本", value="一次審査、二次審査、それぞれの欄に記入をお願いします。")
+             tts_input2 = gr.TextArea(label="如需使用openai,输入你的openai-key", value="官网")
+             tts_input3 = gr.TextArea(label="写上你给她的设定", value="恶魔系学妹。")
+             language = gr.Dropdown(label="选择合成方式",choices=lan, value="对话", interactive=True)
+             para_input1 = gr.Slider(minimum= 0.01,maximum=1.0,label="更改噪声比例", value=0.667)
+             para_input2 = gr.Slider(minimum= 0.01,maximum=1.0,label="更改噪声偏差", value=0.8)
+             para_input3 = gr.Slider(minimum= 0.1,maximum=10,label="更改时间比例", value=1)
+             tts_submit = gr.Button("Generate", variant="primary")
+             speaker1 = gr.Dropdown(label="选择说话人",choices=idols, value="かすみ", interactive=True)
+             tts_output2 = gr.Audio(label="Output")
+             tts_submit.click(infer, [language,tts_input1,tts_input2,tts_input3,speaker1,para_input1,para_input2,para_input3], [tts_output2])
+ #app.launch(share=True)
+ app.launch()
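
Note: tts_submit.click wires the listed components positionally onto infer's parameters, so the three sliders feed the noise and length scales and the single gr.Audio output receives the (sampling_rate, audio) tuple:

    # Gradio inputs: [language, tts_input1, tts_input2, tts_input3, speaker1, para_input1, para_input2, para_input3]
    # infer params:   language, text,       tts_input2, tts_input3, speaker_id, n_scale,  n_scale_w,   l_scale
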
+ '''
+ import time
+ import matplotlib.pyplot as plt
+ import IPython.display as ipd
+ import re
+ import os
+ import json
+ import math
+ import torch
+ from torch import nn
+ from torch.nn import functional as F
+ from torch.utils.data import DataLoader
+ import gradio as gr
+ import commons
+ import utils
+ from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
+ from models import SynthesizerTrn
+ from text.symbols import symbols
+ from text import text_to_sequence
+ import unicodedata
+ from scipy.io.wavfile import write
  def get_text(text, hps):
      text_norm = text_to_sequence(text, hps.data.text_cleaners)
      if hps.data.add_blank:
 
@@ -142,4 +324,5 @@ with app:
              tts_output3 = gr.Image(label = "Model")
              tts_submit.click(infer, [language,tts_input1,speaker1,para_input1,para_input2,para_input3], [tts_output2,tts_output3])
  #app.launch(share=True)
- app.launch()
+ app.launch()
+ '''
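
Note: the net effect of this commit is that the first triple-quote (new line 182) opens an unassigned module-level string that the final triple-quote (new line 328) closes, so the re-listed imports and the whole previous body of app.py are kept in the file but never execute; only the new code above line 182 runs.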