import logging

# Silence noisy third-party loggers before the heavy imports.
logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)

import json
import re
import os
import pickle
import time
import datetime

import numpy as np
import torch
import gradio as gr
import openai
import librosa
import romajitable
from scipy.io.wavfile import write

import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
from mel_processing import spectrogram_torch


def is_japanese(string):
    # Treat any hiragana/katakana character as a sign of Japanese text.
    for ch in string:
        if 0x3040 < ord(ch) < 0x30FF:
            return True
    return False


def is_english(string):
    pattern = re.compile('^[A-Za-z0-9.,:;!?()_*"\' ]+$')
    return bool(pattern.fullmatch(string))


def extrac(text):
    # Split raw text into short sentences suitable for one-by-one synthesis.
    text = re.sub("<[^>]*>", "", text)
    result_list = re.split(r'\n', text)
    final_list = []
    for line in result_list:
        if is_english(line):
            line = romajitable.to_kana(line).katakana
        line = line.replace('\n', '').replace(' ', '')
        # Lines longer than 50 characters are split again on 。/!
        if len(line) > 1:
            if len(line) > 50:
                try:
                    for sentence in re.split(r'。|!', line):
                        if len(sentence) > 1:
                            final_list.append(sentence + '。')
                except Exception:
                    pass
            else:
                final_list.append(line)
    final_list = [x for x in final_list if x != '']
    print(final_list)
    return final_list


def to_numpy(tensor: torch.Tensor):
    return tensor.detach().cpu().numpy() if tensor.requires_grad \
        else tensor.detach().numpy()


def chatgpt(text):
    # Keep a rolling conversation history in log.pickle and query gpt-3.5-turbo.
    messages = []
    try:
        if text != 'exist':
            with open('log.pickle', 'rb') as f:
                messages = pickle.load(f)
    except (FileNotFoundError, EOFError, pickle.PickleError):
        # No usable history yet; start a fresh conversation.
        messages = []
    messages.append({"role": "user", "content": text})
    chat = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
    reply = chat.choices[0].message.content
    messages.append({"role": "assistant", "content": reply})
    print(messages[-1])
    if len(messages) == 12:
        # Cap the stored history at 10 messages: keep the first three exchanges
        # and drop the oldest turns of the remainder.
        messages[6:10] = messages[8:]
        del messages[-2:]
    with open('log.pickle', 'wb') as f:
        pickle.dump(messages, f)
    return reply


def get_symbols_from_json(path):
    assert os.path.isfile(path)
    with open(path, 'r') as f:
        data = json.load(f)
    return data['symbols']


def sle(language, text):
    # Wrap the text in the language tags expected by the text cleaners.
    text = text.replace('\n', '').replace('\r', '').replace(" ", "")
    if language == "中文":
        return "[ZH]" + text + "[ZH]"
    elif language == "自动":
        return f"[JA]{text}[JA]" if is_japanese(text) else f"[ZH]{text}[ZH]"
    elif language == "日文":
        return "[JA]" + text + "[JA]"
    elif language == "英文":
        return "[EN]" + text + "[EN]"
    elif language == "手动":
        return text


def get_text(text, hps_ms):
    text_norm = text_to_sequence(text, hps_ms.symbols, hps_ms.data.text_cleaners)
    if hps_ms.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm
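# Quick illustration (comments only, not executed) of the tagging sle() produces:
#   sle("中文", "你好")          -> "[ZH]你好[ZH]"
#   sle("自动", "こんにちは")     -> "[JA]こんにちは[JA]"   (kana detected by is_japanese)
#   sle("手动", "[JA]テスト[JA]") -> returned unchanged, tags supplied by the caller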
def create_tts_fn(net_g, hps, speaker_id):
    speaker_id = int(speaker_id)

    def tts_fn(is_transfer, original_speaker, target_speaker, history, is_gpt, api_key,
               is_audio, audiopath, repeat_time, text, language, extract,
               n_scale=0.667, n_scale_w=0.8, l_scale=1):
        text = check_text(text)
        repeat_time = int(repeat_time)
        original_speaker_id = selection(original_speaker)
        target_speaker_id = selection(target_speaker)
        if is_gpt:
            openai.api_key = api_key
            text = chatgpt(text)
            history[-1][1] = text
        if not extract:
            # Single-utterance mode: synthesize the whole text in one pass.
            print(text)
            t1 = time.time()
            stn_tst = get_text(sle(language, text), hps)
            with torch.no_grad():
                x_tst = stn_tst.unsqueeze(0).to(dev)
                x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
                sid = torch.LongTensor([speaker_id]).to(dev)
                audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale,
                                    noise_scale_w=n_scale_w,
                                    length_scale=l_scale)[0][0, 0].data.cpu().float().numpy()
            t2 = time.time()
            spending_time = "推理时间为:" + str(t2 - t1) + "s"
            print(spending_time)
            file_path = "subtitles.srt"
            try:
                write(audiopath + '.wav', 22050, audio)
                if is_audio:
                    for i in range(repeat_time):
                        cmd = ('ffmpeg -y -i ' + audiopath + '.wav' + ' -ar 44100 '
                               + audiopath.replace('temp', 'temp' + str(i)))
                        os.system(cmd)
            except Exception:
                pass
            return history, file_path, (hps.data.sampling_rate, audio)
        else:
            # Long-text ("novel") mode: split into sentences, synthesize each one,
            # write SRT subtitles, and optionally run voice conversion per sentence.
            a = ['【', '[', '(', '(']
            b = ['】', ']', ')', ')']
            for i in a:
                text = text.replace(i, '<')
            for i in b:
                text = text.replace(i, '>')
            final_list = extrac(text.replace('“', '').replace('”', ''))
            # Process at most 500 sentences per output file.
            split_list = []
            while len(final_list) > 0:
                split_list.append(final_list[:500])
                final_list = final_list[500:]
            c0 = 0
            for lists in split_list:
                audio_fin = []
                t = datetime.timedelta(seconds=0)
                c = 0
                f1 = open(audiopath.replace('.wav', str(c0) + ".srt"), 'w', encoding='utf-8')
                for sentence in lists:
                    try:
                        c += 1
                        stn_tst = get_text(sle(language, sentence), hps)
                        with torch.no_grad():
                            x_tst = stn_tst.unsqueeze(0).to(dev)
                            x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
                            sid = torch.LongTensor([original_speaker_id]).to(dev)
                            t1 = time.time()
                            audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale,
                                                noise_scale_w=n_scale_w,
                                                length_scale=l_scale)[0][0, 0].data.cpu().float().numpy()
                            t2 = time.time()
                            spending_time = "第" + str(c) + "句的推理时间为:" + str(t2 - t1) + "s"
                            print(spending_time)
                        time_start = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
                        last_time = datetime.timedelta(seconds=len(audio) / float(22050))
                        t += last_time
                        time_end = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
                        print(time_end)
                        f1.write(str(c - 1) + '\n' + time_start + ' --> ' + time_end + '\n' + sentence + '\n\n')
                        if is_transfer:
                            with torch.no_grad():
                                y = torch.FloatTensor(audio)
                                y = y / max(-y.min(), y.max()) / 0.99
                                y = y.to(dev)
                                y = y.unsqueeze(0)
                                spec = spectrogram_torch(y, hps.data.filter_length,
                                                         hps.data.sampling_rate, hps.data.hop_length,
                                                         hps.data.win_length, center=False).to(dev)
                                spec_lengths = torch.LongTensor([spec.size(-1)]).to(dev)
                                sid_src = torch.LongTensor([original_speaker_id]).to(dev)
                                sid_tgt = torch.LongTensor([target_speaker_id]).to(dev)
                                audio = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src,
                                                               sid_tgt=sid_tgt)[0][0, 0].data.cpu().float().numpy()
                                del y, spec, spec_lengths, sid_src, sid_tgt
                        audio_fin.append(audio)
                    except Exception:
                        pass
                f1.close()
                write(audiopath.replace('.wav', str(c0) + '.wav'), 22050, np.concatenate(audio_fin))
                c0 += 1
            # Return the subtitle file of the last chunk that was actually written.
            file_path = audiopath.replace('.wav', str(c0 - 1) + ".srt")
            return history, file_path, (hps.data.sampling_rate, np.concatenate(audio_fin))
    return tts_fn
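# Note on the subtitle timing used in tts_fn above: each sentence's duration is
# len(audio) / 22050 seconds (sample count over the fixed output rate), accumulated
# in a datetime.timedelta and formatted into SRT-style start/end stamps. For example,
# a clip of 66150 samples advances the cursor by exactly 3 seconds.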
def create_vc_fn(net_g, hps):
    def vc_fn(text, language, n_scale, n_scale_w, l_scale,
              original_speaker, target_speaker, record_audio, upload_audio):
        input_audio = record_audio if record_audio is not None else upload_audio
        original_speaker_id = selection(original_speaker)
        target_speaker_id = selection(target_speaker)
        if input_audio is None:
            # No audio given: first synthesize the text with the source speaker.
            stn_tst = get_text(sle(language, text), hps)
            with torch.no_grad():
                x_tst = stn_tst.unsqueeze(0).to(dev)
                x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
                sid = torch.LongTensor([original_speaker_id]).to(dev)
                audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale,
                                    noise_scale_w=n_scale_w,
                                    length_scale=l_scale)[0][0, 0].data.cpu().float().numpy()
            sampling_rate = hps.data.sampling_rate
        else:
            # Normalize the uploaded/recorded audio to mono float32 at the model's rate.
            sampling_rate, audio = input_audio
            audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
            if len(audio.shape) > 1:
                audio = librosa.to_mono(audio.transpose(1, 0))
            if sampling_rate != hps.data.sampling_rate:
                audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)
        with torch.no_grad():
            y = torch.FloatTensor(audio)
            y = y / max(-y.min(), y.max()) / 0.99
            y = y.to(dev)
            y = y.unsqueeze(0)
            spec = spectrogram_torch(y, hps.data.filter_length,
                                     hps.data.sampling_rate, hps.data.hop_length,
                                     hps.data.win_length, center=False).to(dev)
            spec_lengths = torch.LongTensor([spec.size(-1)]).to(dev)
            sid_src = torch.LongTensor([original_speaker_id]).to(dev)
            sid_tgt = torch.LongTensor([target_speaker_id]).to(dev)
            audio = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src,
                                           sid_tgt=sid_tgt)[0][0, 0].data.cpu().float().numpy()
            del y, spec, spec_lengths, sid_src, sid_tgt
        return "Success", (hps.data.sampling_rate, audio)
    return vc_fn


def bot(history, user_message):
    return history + [[check_text(user_message), None]]


# Speaker-name -> speaker-id table used by both the TTS and VC tabs.
speaker_ids = {
    "高咲侑": 0, "歩夢": 1, "かすみ": 2, "しずく": 3, "果林": 4, "愛": 5,
    "彼方": 6, "せつ菜": 7, "エマ": 8, "璃奈": 9, "栞子": 10, "ランジュ": 11,
    "ミア": 12, "派蒙": 16, "c1": 18, "c2": 19, "華恋": 21, "まひる": 22,
    "なな": 23, "クロディーヌ": 24, "ひかり": 25, "純那": 26, "香子": 27,
    "真矢": 28, "双葉": 29, "ミチル": 30, "メイファン": 31, "やちよ": 32,
    "晶": 33, "いちえ": 34, "ゆゆ子": 35, "塁": 36, "珠緒": 37, "あるる": 38,
    "ララフィン": 39, "美空": 40, "静羽": 41,
}


def selection(speaker):
    # Unknown names fall back to speaker id 0.
    return speaker_ids.get(speaker, 0)


def check_text(input):
    # Accept either a plain string or an uploaded text file.
    if isinstance(input, str):
        return input
    with open(input.name, "r", encoding="utf-8") as f:
        return f.read()


if __name__ == '__main__':
    hps = utils.get_hparams_from_file('checkpoints/tmp/config.json')
    dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    models = []
    schools_list = ["ShojoKageki-Nijigasaki", "ShojoKageki", "Nijigasaki"]
    schools = []
    lan = ["中文", "日文", "自动", "手动"]
    with open("checkpoints/info.json", "r", encoding="utf-8") as f:
        models_info = json.load(f)
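    # Expected shape of checkpoints/info.json, inferred from the reads below
    # (field names come from this script; the concrete values are illustrative):
    # {
    #   "Nijigasaki": {
    #     "checkpoint": "checkpoints/.../model.pth",
    #     "speakers": {
    #       "0": {"sid": 0, "name": "高咲侑", "speech": "..."},
    #       ...
    #     }
    #   },
    #   ...
    # }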
    for i in models_info:
        checkpoint = models_info[i]["checkpoint"]
        # Symbol -> index lookup (kept from the original script; not used below).
        phone_dict = {symbol: idx for idx, symbol in enumerate(symbols)}
        net_g = SynthesizerTrn(
            len(symbols),
            hps.data.filter_length // 2 + 1,
            hps.train.segment_size // hps.data.hop_length,
            n_speakers=hps.data.n_speakers,
            **hps.model).to(dev)
        _ = net_g.eval()
        _ = utils.load_checkpoint(checkpoint, net_g)
        school = models_info[i]
        speakers = school["speakers"]
        content = []
        for j in speakers:
            sid = int(speakers[j]['sid'])
            title = school
            example = speakers[j]['speech']
            name = speakers[j]["name"]
            content.append((sid, name, title, example, create_tts_fn(net_g, hps, sid)))
        models.append(content)
        schools.append((i, create_vc_fn(net_g, hps)))
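    # Data layout handed to the UI below: `models` holds one list per school of
    # (sid, name, title, example, tts_fn) tuples, and `schools` pairs each school
    # name with its voice-conversion function.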
    with gr.Blocks() as app:
        with gr.Tabs():
            for (i, vc_fn) in schools:
                with gr.TabItem(i):
                    idols = ["派蒙"]
                    for (sid, name, title, example, tts_fn) in models[schools_list.index(i)]:
                        idols.append(name)
                        with gr.TabItem(name):
                            with gr.Column():
                                with gr.Row():
                                    with gr.Row():
                                        # The character image markup originally embedded here was
                                        # lost; only the empty string shells remain.
                                        gr.Markdown(
                                            ''
                                            f''
                                            ''
                                        )
                                chatbot = gr.Chatbot()
                                with gr.Row():
                                    with gr.Column(scale=0.85):
                                        input1 = gr.TextArea(label="Text", value=example, lines=1)
                                    with gr.Column(scale=0.15, min_width=0):
                                        btnVC = gr.Button("Send")
                                output1 = gr.Audio(label="采样率22050")
                                with gr.Accordion(label="Setting", open=False):
                                    input2 = gr.Dropdown(label="Language", choices=lan, value="自动", interactive=True)
                                    input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.6)
                                    input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.668)
                                    input6 = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1)
                                with gr.Accordion(label="Advanced Setting", open=False):
                                    audio_input3 = gr.Dropdown(label="重复次数", choices=list(range(101)), value='0', interactive=True)
                                    api_input1 = gr.Checkbox(value=False, label="接入chatgpt")
                                    api_input2 = gr.TextArea(label="api-key", lines=1, value='懂得都懂')
                                with gr.Accordion(label="Advanced Setting", open=False):
                                    output2 = gr.outputs.File(label="字幕文件:subtitles.srt")
                                    audio_input1 = gr.Checkbox(value=False, label="保存路径")
                                    audio_input2 = gr.TextArea(label="音频路径", lines=1, value='D:/path/to/live2d/sounds/temp.wav')
                                    input3 = gr.Checkbox(value=False, label="长句切割(小说合成)")
                                    inputxt = gr.File(label="Text")
                                    is_transfer = gr.Checkbox(value=False, label="是否声线转化")
                                    source_speaker = gr.Dropdown(choices=idols, value=name, label="source speaker")
                                    target_speaker = gr.Dropdown(choices=idols, value=name, label="target speaker")
                                    btnbook = gr.Button("小说合成")
                                btnVC.click(bot, inputs=[chatbot, input1], outputs=[chatbot]).then(
                                    tts_fn,
                                    inputs=[is_transfer, source_speaker, target_speaker, chatbot,
                                            api_input1, api_input2, audio_input1, audio_input2,
                                            audio_input3, input1, input2, input3, input4, input5, input6],
                                    outputs=[chatbot, output2, output1]
                                )
                                btnbook.click(bot, inputs=[chatbot, inputxt], outputs=[chatbot]).then(
                                    tts_fn,
                                    inputs=[is_transfer, source_speaker, target_speaker, chatbot,
                                            api_input1, api_input2, audio_input1, audio_input2,
                                            audio_input3, inputxt, input2, input3, input4, input5, input6],
                                    outputs=[chatbot, output2, output1]
                                )
                    with gr.Tab("Voice Conversion(类似sovits)"):
                        gr.Markdown("""
                        声线转化,使用模型中的说话人作为音源时效果更佳
                        """)
                        with gr.Column():
                            with gr.Accordion(label="方法1:录制或上传声音,可进行歌声合成", open=False):
                                record_audio = gr.Audio(label="record your voice", source="microphone")
                                upload_audio = gr.Audio(label="or upload audio here", source="upload")
                            with gr.Accordion(label="方法2:由原说话人先进行tts后套娃,适用于合成中文等特殊场景", open=True):
                                text = gr.TextArea(label="Text", value='由源说话人进行语音转化', lines=1)
                                language = gr.Dropdown(label="Language", choices=lan, value="自动", interactive=True)
                                n_scale = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.6)
                                n_scale_w = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.668)
                                l_scale = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1.1)
                                source_speaker = gr.Dropdown(choices=idols, value=idols[-2], label="source speaker")
                                target_speaker = gr.Dropdown(choices=idols, value=idols[-3], label="target speaker")
                        with gr.Column():
                            message_box = gr.Textbox(label="Message")
                            converted_audio = gr.Audio(label='converted audio')
                            btn = gr.Button("Convert!")
                        btn.click(vc_fn,
                                  inputs=[text, language, n_scale, n_scale_w, l_scale,
                                          source_speaker, target_speaker, record_audio, upload_audio],
                                  outputs=[message_box, converted_audio])
    app.launch()
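    # Scripted use (sketch only, not executed as part of the app): the same pipeline can
    # be driven without the UI by reusing the helpers above, e.g.
    #   stn_tst = get_text(sle("中文", "你好"), hps)
    #   ...then call net_g.infer(...) exactly as tts_fn does.
    # app.launch() blocks until the server is closed; pass share=True to it if a
    # temporary public Gradio link is needed.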