import datetime
import logging
import os
import re
import time

# Lower numba's log level before the heavy third-party imports below, as the
# original file did.
logging.getLogger('numba').setLevel(logging.WARNING)

import gradio as gr
import IPython.display as ipd
import numpy as np
import romajitable
import torch

import commons
import utils
from models import SynthesizerTrn
from text import text_to_sequence
from text.symbols import symbols

logger = logging.getLogger(__name__)


class VitsGradio:
    """Gradio front end for multi-speaker VITS text-to-speech checkpoints.

    Building an instance constructs the whole UI (``self.Vits``); no model is
    loaded until ``loadCk`` is called or the first synthesis request arrives.
    """

    CHECKPOINT_ROOT = "checkpoints"
    # Checkpoint loaded automatically when synthesis is requested before the
    # user picked a model.
    DEFAULT_MODEL = "biaobei"
    SUBTITLE_PATH = "subtitles.srt"
    DEFAULT_TEXT = "为什么你会那么熟练啊?你和雪菜亲过多少次了"

    # Speaker display name -> speaker id passed to the network as ``sid``.
    SPEAKER_IDS = {
        "高咲侑": 0,
        "歩夢": 1,
        "かすみ": 2,
        "しずく": 3,
        "果林": 4,
        "愛": 5,
        "彼方": 6,
        "せつ菜": 7,
        "エマ": 8,
        "璃奈": 9,
        "栞子": 10,
        "ランジュ": 11,
        "ミア": 12,
        "派蒙": 16,
        "華恋": 21,
        "まひる": 22,
        "なな": 23,
        "クロディーヌ": 24,
        "ひかり": 25,
        "純那": 26,
        "香子": 27,
        "真矢": 28,
        "双葉": 29,
        "ミチル": 30,
        "メイファン": 31,
        "やちよ": 32,
        "晶": 33,
        "いちえ": 34,
        "ゆゆ子": 35,
        "塁": 36,
        "珠緒": 37,
        "あるる": 38,
        "ララフィン": 39,
        "美空": 40,
        "静羽": 41,
    }

    # Language choice -> tag wrapped around the text before cleaning.
    LANGUAGE_TAGS = {"中文": "ZH", "日文": "JA", "英文": "EN"}

    _ENGLISH_RE = re.compile('^[A-Za-z0-9.,:;!?()_*"\' ]+$')

    # Brackets become '<' / '>' so that ``extrac`` strips bracketed spans;
    # curly quotes are dropped outright.
    _BRACKET_TABLE = str.maketrans({
        '【': '<', '[': '<', '(': '<', '(': '<',
        '】': '>', ']': '>', ')': '>', ')': '>',
        '“': None, '”': None,
    })

    def __init__(self):
        """Detect the device, list available checkpoints and build the UI."""
        self.dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.lan = ["中文","日文","自动","手动"]
        self.idols = ["歩夢","かすみ","しずく","果林","愛","せつ菜","璃奈","栞子","エマ","ランジュ","ミア","華恋","まひる","なな","クロディーヌ","ひかり",'純那',"香子","真矢","双葉","ミチル","メイファン","やちよ","晶","いちえ","ゆゆ子","塁","珠緒","あるる","ララフィン","美空","静羽"]

        # ``loadCk`` reads checkpoints/<name>/..., so only the immediate
        # sub-directories are valid model names; nested directories are not.
        _, top_level_dirs, _ = next(os.walk(self.CHECKPOINT_ROOT), (None, [], []))
        self.modelPaths = list(top_level_dirs)

        with gr.Blocks() as self.Vits:
            # NOTE(review): the heading markers and paragraph breaks below
            # were reconstructed from literals that were split across lines;
            # confirm against the intended page layout.
            gr.Markdown(
                "## LoveLive!虹咲学园学园偶像同好会-少女☆歌劇 Vits\n"
                "### 请不要生成会对个人以及企划造成侵害的内容\n"
                "\n这是一个实时更新的仓库,目前结束训练有标贝普通话版(biaobei),去标贝版(default),以及少歌部分角色版(ShojoKageki),正在训练中的全员版(tmp)。\n"
                "\n参数说明:由于手游中提取的语音过于有感情,建议将噪声比例调节至0.2-0.3区间,ShojoKageki模型则可以尝试默认的0.667;噪声偏差对应着每个字之间的间隔,对普通话影响较大,建议0.6-0.8;duration代表整体语速\n"
                "\n合成前请先选择模型,否则第一次合成不一定成功。长段落/小说合成建议colab或本地运行\n"
            )
            self._build_synthesis_tab(
                "TTS合成",
                self.infer,
                text_label="Text",
                duration_label="duration",
            )
            with gr.Tab("选择模型"):
                with gr.Column():
                    modelstrs = gr.Dropdown(
                        label="模型",
                        choices=self.modelPaths,
                        # An empty checkpoint directory must not crash UI
                        # construction.
                        value=self.modelPaths[0] if self.modelPaths else None,
                        type="value",
                    )
                    btnMod = gr.Button("载入模型")
                    statusa = gr.TextArea()
                    btnMod.click(self.loadCk, inputs=[modelstrs], outputs=[statusa])
            self._build_synthesis_tab(
                "小说合成(带字幕)",
                self.infer2,
                text_label="建议colab或本地克隆后运行本仓库",
                duration_label="Duration",
                with_subtitles=True,
            )

    def _build_synthesis_tab(self, title, handler, text_label, duration_label,
                             with_subtitles=False):
        """Create one synthesis tab and wire its submit button to *handler*.

        Must be called while the ``gr.Blocks`` context is active. When
        *with_subtitles* is true a file output is added after the audio
        output, so *handler* must return ``(audio, file_path)``.
        """
        with gr.Tab(title):
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        with gr.Column():
                            text_box = gr.TextArea(label=text_label, value=self.DEFAULT_TEXT)
                            language_box = gr.Dropdown(label="Language", choices=self.lan, value="自动", interactive=True)
                            speaker_box = gr.Dropdown(label="Speaker", choices=self.idols, value="歩夢", interactive=True)
                            submit = gr.Button("Submit")
                        with gr.Column():
                            noise = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.267)
                            noise_w = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.7)
                            duration = gr.Slider(minimum=0.1, maximum=10, label=duration_label, value=1)
                            outputs = [gr.Audio(label="采样率22050")]
                            if with_subtitles:
                                # gr.File replaces the deprecated
                                # gr.outputs.File alias.
                                outputs.append(gr.File(label="字幕文件:subtitles.srt"))
            submit.click(
                handler,
                inputs=[text_box, language_box, speaker_box, noise, noise_w, duration],
                outputs=outputs,
            )

    def loadCk(self, path):
        """Load ``checkpoints/<path>/{config.json,model.pth}`` as the active model.

        ``self.hps`` and ``self.net_g`` are replaced only after the checkpoint
        loaded successfully, so a failed load leaves the previous model usable.

        Returns:
            The string ``"success"`` (shown in the status box).
        """
        model_dir = os.path.join(self.CHECKPOINT_ROOT, path)
        hps = utils.get_hparams_from_file(os.path.join(model_dir, "config.json"))
        net_g = SynthesizerTrn(
            len(symbols),
            hps.data.filter_length // 2 + 1,
            hps.train.segment_size // hps.data.hop_length,
            n_speakers=hps.data.n_speakers,
            **hps.model).to(self.dev)
        net_g.eval()
        utils.load_checkpoint(os.path.join(model_dir, "model.pth"), net_g)
        self.hps = hps
        self.net_g = net_g
        return "success"

    def _ensure_model(self):
        """Load the default checkpoint if no model has been loaded yet."""
        if getattr(self, "hps", None) is None or getattr(self, "net_g", None) is None:
            self.loadCk(self.DEFAULT_MODEL)

    def get_text(self, text):
        """Convert tagged text to a ``torch.LongTensor`` of symbol ids.

        Uses the cleaners named in the loaded model's config and intersperses
        id 0 between symbols when the config sets ``add_blank``.
        """
        text_norm = text_to_sequence(text, self.hps.data.text_cleaners)
        if self.hps.data.add_blank:
            text_norm = commons.intersperse(text_norm, 0)
        return torch.LongTensor(text_norm)

    def is_japanese(self, string):
        """Return True if *string* has a code point strictly between U+3040 and U+30FF."""
        return any(0x3040 < ord(ch) < 0x30FF for ch in string)

    def is_english(self, string):
        """Return True if *string* is non-empty and only ASCII letters, digits, basic punctuation and spaces."""
        return self._ENGLISH_RE.fullmatch(string) is not None

    def selection(self, speaker):
        """Map a speaker display name to its numeric id; unknown names map to 0."""
        return self.SPEAKER_IDS.get(speaker, 0)

    def sle(self, language, text):
        """Wrap *text* in the language tags expected by the text cleaners.

        Newlines become '。' and spaces become ',' first. "自动" picks
        ``[JA]`` when the text contains kana and ``[ZH]`` otherwise; "手动"
        returns the text untagged so the caller can supply its own tags.

        Raises:
            ValueError: if *language* is not a known choice (previously this
                silently returned ``None``).
        """
        text = text.replace('\n','。').replace(' ',',')
        if language == "手动":
            return text
        if language == "自动":
            tag = "JA" if self.is_japanese(text) else "ZH"
        elif language in self.LANGUAGE_TAGS:
            tag = self.LANGUAGE_TAGS[language]
        else:
            raise ValueError(f"unsupported language choice: {language!r}")
        return f"[{tag}]{text}[{tag}]"

    def extrac(self, text, max_len=20):
        """Split *text* into sentences short enough to synthesize one by one.

        ``<...>`` spans are removed, then the text is processed line by line:
        pure-English lines are converted to katakana, spaces are dropped, and
        lines of at most one character are discarded. Lines longer than
        *max_len* characters are split on '。' and '!', with '。' re-appended
        to every piece that is longer than one character.

        Returns:
            The list of sentences (also printed).
        """
        text = re.sub("<[^>]*>","",text)
        sentences = []
        for line in text.split('\n'):
            if self.is_english(line):
                line = romajitable.to_kana(line).katakana
            line = line.replace('\n','').replace(' ','')
            if len(line) <= 1:
                continue
            if len(line) <= max_len:
                sentences.append(line)
                continue
            sentences.extend(
                piece + '。' for piece in re.split(r'。|!', line) if len(piece) > 1
            )
        print(sentences)
        return sentences

    def _synthesize(self, tagged_text, speaker, n_scale, n_scale_w, l_scale):
        """Run the loaded network on *tagged_text* and return the waveform as a numpy array."""
        stn_tst = self.get_text(tagged_text)
        with torch.no_grad():
            x_tst = stn_tst.unsqueeze(0).to(self.dev)
            x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(self.dev)
            sid = torch.LongTensor([speaker]).to(self.dev)
            return self.net_g.infer(
                x_tst, x_tst_lengths, sid=sid,
                noise_scale=n_scale, noise_scale_w=n_scale_w,
                length_scale=l_scale)[0][0,0].data.cpu().float().numpy()

    @staticmethod
    def _format_srt_time(offset):
        """Format a ``timedelta`` as an SRT timestamp ``HH:MM:SS,mmm`` (milliseconds truncated)."""
        total_ms = offset // datetime.timedelta(milliseconds=1)
        hours, rest = divmod(total_ms, 3_600_000)
        minutes, rest = divmod(rest, 60_000)
        seconds, millis = divmod(rest, 1000)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"

    def infer(self, text, language, speaker_id, n_scale=0.667, n_scale_w=0.8, l_scale=1):
        """Synthesize *text* as a single utterance.

        If no model has been loaded yet the default checkpoint is loaded
        first, so the very first request succeeds. A model the user already
        selected is never replaced.

        Returns:
            ``(sampling_rate, waveform)``, or ``None`` if synthesis failed
            (the error is logged).
        """
        self._ensure_model()
        try:
            speaker = int(self.selection(speaker_id))
            started = time.time()
            audio = self._synthesize(
                self.sle(language, text), speaker, n_scale, n_scale_w, l_scale)
            print(f"推理时间为:{time.time() - started}s")
            return (self.hps.data.sampling_rate, audio)
        except Exception:
            # Keep the UI responsive (empty output) but leave a trace instead
            # of swallowing the error silently.
            logger.exception("TTS inference failed")
            return None

    def infer2(self, text, language, speaker_id, n_scale=0.667, n_scale_w=0.8, l_scale=1):
        """Synthesize long text sentence by sentence and write matching SRT subtitles.

        Bracketed spans and curly quotes are stripped, the text is split with
        ``extrac``, each sentence is synthesized on its own, and one subtitle
        cue per sentence is written to ``subtitles.srt`` with timings derived
        from the length of the generated audio.

        Returns:
            ``((sampling_rate, waveform), subtitle_path)``.
        """
        self._ensure_model()
        speaker = int(self.selection(speaker_id))
        sentences = self.extrac(text.translate(self._BRACKET_TABLE))
        sampling_rate = self.hps.data.sampling_rate

        clips = []
        cursor = datetime.timedelta(seconds=0)
        # The context manager guarantees the file is flushed and closed before
        # its path is handed back to Gradio.
        with open(self.SUBTITLE_PATH, 'w', encoding='utf-8') as srt:
            # SRT cue numbers start at 1.
            for index, sentence in enumerate(sentences, start=1):
                started = time.time()
                # Synthesize this sentence only, not the whole input text.
                audio = self._synthesize(
                    self.sle(language, sentence), speaker, n_scale, n_scale_w, l_scale)
                print(f"第{index}句的推理时间为:{time.time() - started}s")
                cue_start = self._format_srt_time(cursor)
                cursor += datetime.timedelta(seconds=len(audio) / float(sampling_rate))
                cue_end = self._format_srt_time(cursor)
                print(cue_end)
                srt.write(f"{index}\n{cue_start} --> {cue_end}\n{sentence}\n\n")
                clips.append(audio)

        # np.concatenate rejects an empty list; return silence instead.
        merged = np.concatenate(clips) if clips else np.zeros(0, dtype=np.float32)
        return (sampling_rate, merged), self.SUBTITLE_PATH


print("开始部署")
grVits = VitsGradio()
grVits.Vits.launch()