import logging

logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)

import json
import re
import datetime
import os
import pickle
import time

import numpy as np
import torch
import librosa
import openai
import gradio as gr
import romajitable
import IPython.display as ipd
from scipy.io.wavfile import write

import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
from mel_processing import spectrogram_torch

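# Note: this script assumes a VITS-style repository layout (commons, utils,
# models, text, mel_processing) plus the legacy openai<1.0 ChatCompletion API
# and a gradio 3.x interface (gr.outputs.File, gr.Audio(source=...)).
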
def is_japanese(string):
    # True if the string contains any hiragana/katakana character.
    for ch in string:
        if ord(ch) > 0x3040 and ord(ch) < 0x30FF:
            return True
    return False

def is_english(string):
    # True if the string consists only of basic Latin letters, digits and punctuation.
    pattern = re.compile('^[A-Za-z0-9.,:;!?()_*"\' ]+$')
    return bool(pattern.fullmatch(string))

def extrac(text):
    # Strip <...> markup, split the text into sentences and convert English lines to katakana.
    text = re.sub("<[^>]*>", "", text)
    result_list = re.split(r'\n', text)
    final_list = []
    for line in result_list:
        if is_english(line):
            line = romajitable.to_kana(line).katakana
        line = line.replace('\n', '').replace(' ', '')
        if len(line) > 1:
            if len(line) > 50:
                try:
                    # Long lines are further split on sentence-ending punctuation.
                    cur_list = re.split(r'。|!', line)
                    for sub in cur_list:
                        if len(sub) > 1:
                            final_list.append(sub + '。')
                except:
                    pass
            else:
                final_list.append(line)
    final_list = [x for x in final_list if x != '']
    print(final_list)
    return final_list

def to_numpy(tensor: torch.Tensor):
    return tensor.detach().cpu().numpy() if tensor.requires_grad \
        else tensor.detach().numpy()

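# chatgpt() keeps a rolling conversation log in log.pickle: the first six
# messages are preserved and, once the log reaches 12 entries, the oldest
# later user/assistant pair is dropped.  A minimal usage sketch (the key
# below is a placeholder, not a real credential):
#
#   openai.api_key = "sk-..."   # hypothetical key
#   reply = chatgpt("こんにちは")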
def chatgpt(text):
    messages = []
    try:
        # Reload the saved conversation log if one exists.
        if text != 'exist':
            with open('log.pickle', 'rb') as f:
                messages = pickle.load(f)
    except:
        messages = []
    messages.append({"role": "user", "content": text})
    chat = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
    reply = chat.choices[0].message.content
    messages.append({"role": "assistant", "content": reply})
    print(messages[-1])
    if len(messages) == 12:
        # Keep the first six messages and drop the oldest later exchange.
        messages[6:10] = messages[8:]
        del messages[-2:]
    with open('log.pickle', 'wb') as f:
        pickle.dump(messages, f)
    return reply

def get_symbols_from_json(path):
    assert os.path.isfile(path)
    with open(path, 'r') as f:
        data = json.load(f)
    return data['symbols']

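# sle() wraps the cleaned text in the language tag expected by the text cleaners,
# e.g. sle("中文", "你好") -> "[ZH]你好[ZH]" and sle("日文", "こんにちは") -> "[JA]こんにちは[JA]";
# "自动" picks [JA]/[ZH] automatically and "手动" passes the text through unchanged.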
def sle(language, text):
    text = text.replace('\n', '').replace('\r', '').replace(" ", "")
    if language == "中文":
        return "[ZH]" + text + "[ZH]"
    elif language == "自动":
        return f"[JA]{text}[JA]" if is_japanese(text) else f"[ZH]{text}[ZH]"
    elif language == "日文":
        return "[JA]" + text + "[JA]"
    elif language == "英文":
        return "[EN]" + text + "[EN]"
    elif language == "手动":
        return text

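# get_text() runs the configured text cleaners and, when add_blank is set in the
# config, intersperses a blank token (id 0) between symbols before building the tensor.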
def get_text(text, hps_ms):
    text_norm = text_to_sequence(text, hps_ms.symbols, hps_ms.data.text_cleaners)
    if hps_ms.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm

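# create_tts_fn() binds a model and default speaker id and returns the Gradio
# callback.  In plain mode it synthesises the whole text at once; in "长句切割"
# (novel) mode it splits the text into sentences, writes numbered .wav/.srt
# pairs, and can optionally run voice conversion on every sentence.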
def create_tts_fn(net_g, hps, speaker_id):
    speaker_id = int(speaker_id)

    def tts_fn(is_transfer, original_speaker, target_speaker, history, is_gpt, api_key,
               is_audio, audiopath, repeat_time, text, language, extract,
               n_scale=0.667, n_scale_w=0.8, l_scale=1):
        text = check_text(text)
        repeat_time = int(repeat_time)
        original_speaker_id = selection(original_speaker)
        target_speaker_id = selection(target_speaker)
        if is_gpt:
            openai.api_key = api_key
            text = chatgpt(text)
        history[-1][1] = text
        if not extract:
            print(text)
            t1 = time.time()
            stn_tst = get_text(sle(language, text), hps)
            with torch.no_grad():
                x_tst = stn_tst.unsqueeze(0).to(dev)
                x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
                sid = torch.LongTensor([speaker_id]).to(dev)
                audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale,
                                    noise_scale_w=n_scale_w, length_scale=l_scale)[0][0, 0].data.cpu().float().numpy()
            t2 = time.time()
            spending_time = "推理时间为:" + str(t2 - t1) + "s"
            print(spending_time)
            file_path = "subtitles.srt"
            try:
                write(audiopath + '.wav', 22050, audio)
                if is_audio:
                    for i in range(repeat_time):
                        cmd = 'ffmpeg -y -i ' + audiopath + '.wav' + ' -ar 44100 ' + audiopath.replace('temp', 'temp' + str(i))
                        os.system(cmd)
            except:
                pass
            return history, file_path, (hps.data.sampling_rate, audio)
        else:
            # Convert bracket pairs to <...> so that extrac() drops the bracketed content.
            a = ['【', '[', '(', '(']
            b = ['】', ']', ')', ')']
            for i in a:
                text = text.replace(i, '<')
            for i in b:
                text = text.replace(i, '>')
            final_list = extrac(text.replace('“', '').replace('”', ''))
            split_list = []
            while len(final_list) > 0:
                split_list.append(final_list[:500])
                final_list = final_list[500:]
            c0 = 0
            for lists in split_list:
                audio_fin = []
                t = datetime.timedelta(seconds=0)
                c = 0
                f1 = open(audiopath.replace('.wav', str(c0) + ".srt"), 'w', encoding='utf-8')
                for sentence in lists:
                    try:
                        c += 1
                        stn_tst = get_text(sle(language, sentence), hps)
                        with torch.no_grad():
                            x_tst = stn_tst.unsqueeze(0).to(dev)
                            x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
                            sid = torch.LongTensor([original_speaker_id]).to(dev)
                            t1 = time.time()
                            audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale,
                                                noise_scale_w=n_scale_w, length_scale=l_scale)[0][0, 0].data.cpu().float().numpy()
                            t2 = time.time()
                        spending_time = "第" + str(c) + "句的推理时间为:" + str(t2 - t1) + "s"
                        print(spending_time)
                        # Build the SRT timestamps from the accumulated audio length.
                        time_start = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
                        last_time = datetime.timedelta(seconds=len(audio) / float(22050))
                        t += last_time
                        time_end = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
                        print(time_end)
                        f1.write(str(c - 1) + '\n' + time_start + ' --> ' + time_end + '\n' + sentence + '\n\n')
                        if is_transfer:
                            with torch.no_grad():
                                y = torch.FloatTensor(audio)
                                y = y / max(-y.min(), y.max()) / 0.99
                                y = y.to(dev)
                                y = y.unsqueeze(0)
                                spec = spectrogram_torch(y, hps.data.filter_length,
                                                         hps.data.sampling_rate, hps.data.hop_length,
                                                         hps.data.win_length, center=False).to(dev)
                                spec_lengths = torch.LongTensor([spec.size(-1)]).to(dev)
                                sid_src = torch.LongTensor([original_speaker_id]).to(dev)
                                sid_tgt = torch.LongTensor([target_speaker_id]).to(dev)
                                audio = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src,
                                                               sid_tgt=sid_tgt)[0][0, 0].data.cpu().float().numpy()
                                del y, spec, spec_lengths, sid_src, sid_tgt
                        audio_fin.append(audio)
                    except:
                        pass
                f1.close()
                write(audiopath.replace('.wav', str(c0) + '.wav'), 22050, np.concatenate(audio_fin))
                c0 += 1
            # Return the subtitle file that matches the last chunk actually written.
            file_path = audiopath.replace('.wav', str(c0 - 1) + ".srt")
            return history, file_path, (hps.data.sampling_rate, np.concatenate(audio_fin))
    return tts_fn

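# create_vc_fn() returns the voice-conversion callback: it uses a recorded or
# uploaded clip when one is given, otherwise it first synthesises the text with
# the source speaker and then converts that audio to the target speaker's voice.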
def create_vc_fn(net_g, hps):
    def vc_fn(text, language, n_scale, n_scale_w, l_scale, original_speaker, target_speaker,
              record_audio, upload_audio):
        input_audio = record_audio if record_audio is not None else upload_audio
        original_speaker_id = selection(original_speaker)
        target_speaker_id = selection(target_speaker)
        if input_audio is None:
            # No audio supplied: synthesise the text with the source speaker first.
            stn_tst = get_text(sle(language, text), hps)
            with torch.no_grad():
                x_tst = stn_tst.unsqueeze(0).to(dev)
                x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
                sid = torch.LongTensor([original_speaker_id]).to(dev)
                audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale,
                                    noise_scale_w=n_scale_w, length_scale=l_scale)[0][0, 0].data.cpu().float().numpy()
            sampling_rate = hps.data.sampling_rate
        else:
            sampling_rate, audio = input_audio
            audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
            if len(audio.shape) > 1:
                audio = librosa.to_mono(audio.transpose(1, 0))
            if sampling_rate != hps.data.sampling_rate:
                audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)
        with torch.no_grad():
            y = torch.FloatTensor(audio)
            y = y / max(-y.min(), y.max()) / 0.99
            y = y.to(dev)
            y = y.unsqueeze(0)
            spec = spectrogram_torch(y, hps.data.filter_length,
                                     hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
                                     center=False).to(dev)
            spec_lengths = torch.LongTensor([spec.size(-1)]).to(dev)
            sid_src = torch.LongTensor([original_speaker_id]).to(dev)
            sid_tgt = torch.LongTensor([target_speaker_id]).to(dev)
            audio = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src,
                                           sid_tgt=sid_tgt)[0][0, 0].data.cpu().float().numpy()
            del y, spec, spec_lengths, sid_src, sid_tgt
        return "Success", (hps.data.sampling_rate, audio)
    return vc_fn

def bot(history, user_message):
    return history + [[check_text(user_message), None]]

def selection(speaker):
    # Map a display name to its speaker id in the model; unknown names fall back to 0.
    speaker_ids = {
        "高咲侑": 0, "歩夢": 1, "かすみ": 2, "しずく": 3, "果林": 4, "愛": 5,
        "彼方": 6, "せつ菜": 7, "エマ": 8, "璃奈": 9, "栞子": 10, "ランジュ": 11,
        "ミア": 12, "派蒙": 16, "c1": 18, "c2": 19,
        "華恋": 21, "まひる": 22, "なな": 23, "クロディーヌ": 24, "ひかり": 25,
        "純那": 26, "香子": 27, "真矢": 28, "双葉": 29, "ミチル": 30,
        "メイファン": 31, "やちよ": 32, "晶": 33, "いちえ": 34, "ゆゆ子": 35,
        "塁": 36, "珠緒": 37, "あるる": 38, "ララフィン": 39, "美空": 40, "静羽": 41,
    }
    return speaker_ids.get(speaker, 0)

def check_text(input):
    # Accept either a plain string or an uploaded file object from gradio.
    if isinstance(input, str):
        return input
    else:
        with open(input.name, "r", encoding="utf-8") as f:
            return f.read()

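# The main block expects checkpoints/tmp/config.json for the shared hparams and
# checkpoints/info.json describing each model (a "checkpoint" path plus a
# "speakers" table with "sid", "name" and an example "speech" line).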
if __name__ == '__main__':
    hps = utils.get_hparams_from_file('checkpoints/tmp/config.json')
    dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    models = []
    schools_list = ["ShojoKageki-Nijigasaki", "ShojoKageki", "Nijigasaki"]
    schools = []
    lan = ["中文", "日文", "自动", "手动"]
    with open("checkpoints/info.json", "r", encoding="utf-8") as f:
        models_info = json.load(f)
    for i in models_info:
        checkpoint = models_info[i]["checkpoint"]
        phone_dict = {
            symbol: i for i, symbol in enumerate(symbols)
        }
        net_g = SynthesizerTrn(
            len(symbols),
            hps.data.filter_length // 2 + 1,
            hps.train.segment_size // hps.data.hop_length,
            n_speakers=hps.data.n_speakers,
            **hps.model).to(dev)
        _ = net_g.eval()
        _ = utils.load_checkpoint(checkpoint, net_g)
        school = models_info[i]
        speakers = school["speakers"]
        content = []
        for j in speakers:
            sid = int(speakers[j]['sid'])
            title = school
            example = speakers[j]['speech']
            name = speakers[j]["name"]
            content.append((sid, name, title, example, create_tts_fn(net_g, hps, sid)))
        models.append(content)
        schools.append((i, create_vc_fn(net_g, hps)))

    with gr.Blocks() as app:
        with gr.Tabs():
            for (i, vc_fn) in schools:
                with gr.TabItem(i):
                    idols = ["派蒙"]
                    for (sid, name, title, example, tts_fn) in models[schools_list.index(i)]:
                        idols.append(name)
                        with gr.TabItem(name):
                            with gr.Column():
                                with gr.Row():
                                    with gr.Row():
                                        gr.Markdown(
                                            '<div align="center">'
                                            f'<img style="width:auto;height:400px;" src="file/image/{name}.png">'
                                            '</div>'
                                        )
                                chatbot = gr.Chatbot()
                                with gr.Row():
                                    with gr.Column(scale=0.85):
                                        input1 = gr.TextArea(label="Text", value=example, lines=1)
                                    with gr.Column(scale=0.15, min_width=0):
                                        btnVC = gr.Button("Send")
                                output1 = gr.Audio(label="采样率22050")
                                with gr.Accordion(label="Setting", open=False):
                                    input2 = gr.Dropdown(label="Language", choices=lan, value="自动", interactive=True)
                                    input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.6)
                                    input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.668)
                                    input6 = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1)
                                with gr.Accordion(label="Advanced Setting", open=False):
                                    audio_input3 = gr.Dropdown(label="重复次数", choices=list(range(101)), value='0', interactive=True)
                                    api_input1 = gr.Checkbox(value=False, label="接入chatgpt")
                                    api_input2 = gr.TextArea(label="api-key", lines=1, value='懂得都懂')
                                with gr.Accordion(label="Advanced Setting", open=False):
                                    output2 = gr.outputs.File(label="字幕文件:subtitles.srt")
                                    audio_input1 = gr.Checkbox(value=False, label="保存路径")
                                    audio_input2 = gr.TextArea(label="音频路径", lines=1, value='D:/path/to/live2d/sounds/temp.wav')
                                    input3 = gr.Checkbox(value=False, label="长句切割(小说合成)")
                                    inputxt = gr.File(label="Text")
                                    is_transfer = gr.Checkbox(value=False, label="是否声线转化")
                                    source_speaker = gr.Dropdown(choices=idols, value=name, label="source speaker")
                                    target_speaker = gr.Dropdown(choices=idols, value=name, label="target speaker")
                                    btnbook = gr.Button("小说合成")
                                btnVC.click(bot, inputs=[chatbot, input1], outputs=[chatbot]).then(
                                    tts_fn,
                                    inputs=[is_transfer, source_speaker, target_speaker, chatbot, api_input1, api_input2,
                                            audio_input1, audio_input2, audio_input3, input1, input2, input3,
                                            input4, input5, input6],
                                    outputs=[chatbot, output2, output1]
                                )
                                btnbook.click(bot, inputs=[chatbot, inputxt], outputs=[chatbot]).then(
                                    tts_fn,
                                    inputs=[is_transfer, source_speaker, target_speaker, chatbot, api_input1, api_input2,
                                            audio_input1, audio_input2, audio_input3, inputxt, input2, input3,
                                            input4, input5, input6],
                                    outputs=[chatbot, output2, output1]
                                )
                    with gr.Tab("Voice Conversion(类似sovits)"):
                        gr.Markdown("""
                        声线转化,使用模型中的说话人作为音源时效果更佳
                        """)
                        with gr.Column():
                            with gr.Accordion(label="方法1:录制或上传声音,可进行歌声合成", open=False):
                                record_audio = gr.Audio(label="record your voice", source="microphone")
                                upload_audio = gr.Audio(label="or upload audio here", source="upload")
                            with gr.Accordion(label="方法2:由原说话人先进行tts后套娃,适用于合成中文等特殊场景", open=True):
                                text = gr.TextArea(label="Text", value='由源说话人进行语音转化', lines=1)
                                language = gr.Dropdown(label="Language", choices=lan, value="自动", interactive=True)
                                n_scale = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.6)
                                n_scale_w = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.668)
                                l_scale = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1.1)
                                source_speaker = gr.Dropdown(choices=idols, value=idols[-2], label="source speaker")
                                target_speaker = gr.Dropdown(choices=idols, value=idols[-3], label="target speaker")
                        with gr.Column():
                            message_box = gr.Textbox(label="Message")
                            converted_audio = gr.Audio(label='converted audio')
                        btn = gr.Button("Convert!")
                        btn.click(vc_fn,
                                  inputs=[text, language, n_scale, n_scale_w, l_scale, source_speaker, target_speaker,
                                          record_audio, upload_audio],
                                  outputs=[message_box, converted_audio])
    app.launch()