import romajitable
import re
import numpy as np
import logging
logging.getLogger('numba').setLevel(logging.WARNING)
import IPython.display as ipd
import torch
import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
import gradio as gr
import time
import datetime
import os
class VitsGradio:
    """Gradio front-end for a multi-speaker VITS text-to-speech model.

    The interface has three tabs: single-utterance synthesis, checkpoint
    selection, and long-text ("novel") synthesis that also writes an SRT
    subtitle file next to the generated audio.

    Attributes:
        dev: torch device used for the model and its input tensors.
        lan: language modes offered in the UI.
        idols: speaker names offered in the UI.
        modelPaths: names of the sub-directories found under ``checkpoints``.
        Vits: the ``gr.Blocks`` application.
        hps / net_g: hyper-parameters and network of the loaded checkpoint
            (``None`` until a checkpoint has been loaded).
    """

    _CHECKPOINT_ROOT = "checkpoints"
    # Checkpoint loaded on demand when synthesis is requested before the
    # user picked one in the model tab.
    _DEFAULT_MODEL = "biaobei"
    _SUBTITLE_PATH = "subtitles.srt"
    # Lines longer than this are split into sentences before synthesis.
    _MAX_SENTENCE_LEN = 20

    # Speaker name -> speaker id passed to the network.
    _SPEAKER_IDS = {
        "高咲侑": 0,
        "歩夢": 1,
        "かすみ": 2,
        "しずく": 3,
        "果林": 4,
        "愛": 5,
        "彼方": 6,
        "せつ菜": 7,
        "エマ": 8,
        "璃奈": 9,
        "栞子": 10,
        "ランジュ": 11,
        "ミア": 12,
        "派蒙": 16,
    }

    _ENGLISH_RE = re.compile('^[A-Za-z0-9.,:;!?()_*"\' ]+$')
    _TAG_RE = re.compile("<[^>]*>")
    _SENTENCE_END_RE = re.compile(r'。|!')

    # Opening/closing brackets are rewritten to '<' / '>' so that extrac()
    # strips bracketed annotations. The original lists repeated the ASCII
    # parenthesis twice; the full-width forms are included here on the
    # assumption that this was the intent -- TODO confirm.
    _BRACKET_TABLE = str.maketrans({
        '【': '<', '[': '<', '(': '<', '\uff08': '<',
        '】': '>', ']': '>', ')': '>', '\uff09': '>',
    })

    def __init__(self):
        """Discover the available checkpoints and build the Gradio UI."""
        self.dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.lan = ["中文", "日文", "自动", "手动"]
        self.idols = ["歩夢", "かすみ", "しずく", "果林", "愛", "せつ菜",
                      "璃奈", "栞子", "エマ", "ランジュ", "ミア", "派蒙"]
        self.hps = None
        self.net_g = None
        self.modelPaths = self._list_models()
        # An empty checkpoint directory must not prevent the UI from starting.
        default_model = self.modelPaths[0] if self.modelPaths else None

        with gr.Blocks() as self.Vits:
            gr.Markdown(
                "## Lovelive虹团中日双语VITS\n"
                "### 请不要生成会对个人以及企划造成侵害的内容\n"
                "目前有标贝普通话版,去标贝版,少歌模型目前还是大饼状态\n"
            )
            with gr.Tab("TTS合成"):
                submit, inputs, outputs = self._build_synthesis_panel(
                    text_label="Text",
                    duration_label="duration",
                    with_subtitles=False,
                )
                submit.click(self.infer, inputs=inputs, outputs=outputs)
            with gr.Tab("选择模型"):
                with gr.Column():
                    modelstrs = gr.Dropdown(
                        label="模型",
                        choices=self.modelPaths,
                        value=default_model,
                        type="value",
                    )
                    btnMod = gr.Button("载入模型")
                    statusa = gr.TextArea()
                    btnMod.click(self.loadCk, inputs=[modelstrs], outputs=[statusa])
            with gr.Tab("小说合成(带字幕)"):
                submit, inputs, outputs = self._build_synthesis_panel(
                    text_label="建议colab或本地克隆后运行本仓库",
                    duration_label="Duration",
                    with_subtitles=True,
                )
                submit.click(self.infer2, inputs=inputs, outputs=outputs)

    def _list_models(self):
        """Return the names of the immediate sub-directories of the checkpoint root."""
        # Only the first level is usable: loadCk() resolves a model as
        # checkpoints/<name>/..., so nested directory names would not load.
        for _root, dirs, _files in os.walk(self._CHECKPOINT_ROOT):
            return list(dirs)
        return []

    def _build_synthesis_panel(self, text_label, duration_label, with_subtitles):
        """Create the widgets shared by both synthesis tabs.

        Returns:
            ``(submit_button, input_components, output_components)``. The
            inputs are ordered like the parameters of infer() / infer2().
        """
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    with gr.Column():
                        text_box = gr.TextArea(
                            label=text_label,
                            value="为什么你会那么熟练啊?你和雪菜亲过多少次了",
                        )
                        language_box = gr.Dropdown(
                            label="Language", choices=self.lan,
                            value="自动", interactive=True,
                        )
                        speaker_box = gr.Dropdown(
                            label="Speaker", choices=self.idols,
                            value="歩夢", interactive=True,
                        )
                        submit = gr.Button("Submit")
                    with gr.Column():
                        noise_scale = gr.Slider(
                            minimum=0, maximum=1.0,
                            label="更改噪声比例(noise scale),以控制情感",
                            value=0.267,
                        )
                        noise_scale_w = gr.Slider(
                            minimum=0, maximum=1.0,
                            label="更改噪声偏差(noise scale w),以控制音素长短",
                            value=0.7,
                        )
                        duration = gr.Slider(
                            minimum=0.1, maximum=10,
                            label=duration_label, value=1,
                        )
                        outputs = [gr.Audio(label="采样率22050")]
                        if with_subtitles:
                            # gr.File is the Blocks-era component; the
                            # legacy gr.outputs namespace is deprecated.
                            outputs.append(gr.File(label="字幕文件:subtitles.srt"))
        inputs = [text_box, language_box, speaker_box,
                  noise_scale, noise_scale_w, duration]
        return submit, inputs, outputs

    def _load_model(self, name):
        """Load ``checkpoints/<name>/{config.json,model.pth}`` into hps / net_g."""
        base = f"{self._CHECKPOINT_ROOT}/{name}"
        hps = utils.get_hparams_from_file(f"{base}/config.json")
        net_g = SynthesizerTrn(
            len(symbols),
            hps.data.filter_length // 2 + 1,
            hps.train.segment_size // hps.data.hop_length,
            n_speakers=hps.data.n_speakers,
            **hps.model).to(self.dev)
        net_g.eval()
        utils.load_checkpoint(f"{base}/model.pth", net_g)
        # Publish only after everything succeeded, so a failed load keeps
        # the previously loaded model usable.
        self.hps = hps
        self.net_g = net_g

    def _ensure_model(self):
        """Load the default checkpoint if no model has been loaded yet."""
        if getattr(self, "net_g", None) is None or getattr(self, "hps", None) is None:
            self._load_model(self._DEFAULT_MODEL)

    def loadCk(self, path):
        """Load the checkpoint directory named *path*; return ``"success"``."""
        self._load_model(path)
        return "success"

    def get_text(self, text):
        """Convert tagged text to a LongTensor of symbol ids for the model."""
        text_norm = text_to_sequence(text, self.hps.data.text_cleaners)
        if self.hps.data.add_blank:
            text_norm = commons.intersperse(text_norm, 0)
        return torch.LongTensor(text_norm)

    def is_japanese(self, string):
        """Return True if *string* has a code point strictly between U+3040 and U+30FF."""
        return any(0x3040 < ord(ch) < 0x30FF for ch in string)

    def is_english(self, string):
        """Return True if *string* is non-empty and made only of ASCII letters,
        digits, spaces and the punctuation ``.,:;!?()_*"'``."""
        return self._ENGLISH_RE.fullmatch(string) is not None

    def selection(self, speaker):
        """Return the speaker id for *speaker*, or ``None`` if the name is unknown."""
        return self._SPEAKER_IDS.get(speaker)

    def sle(self, language, text):
        """Wrap *text* in the language tags selected by *language*.

        Newlines become '。' and spaces become ','. "手动" returns the text
        untagged (the user supplies the tags); "自动" picks [JA] when
        is_japanese() matches and [ZH] otherwise. An unrecognised
        *language* yields ``None``.
        """
        text = text.replace('\n', '。').replace(' ', ',')
        if language == "手动":
            return text
        if language == "自动":
            tag = "[JA]" if self.is_japanese(text) else "[ZH]"
        else:
            tag = {"中文": "[ZH]", "日文": "[JA]", "英文": "[EN]"}.get(language)
            if tag is None:
                return None
        return tag + text + tag

    def extrac(self, text):
        """Split *text* into the sentences that are synthesised one by one.

        ``<...>`` spans are removed and the text is split on newlines. Lines
        that is_english() accepts are converted to katakana, spaces are
        dropped, and fragments of one character or less are discarded. Lines
        longer than 20 characters are split again on '。' / '!' and each
        kept part gets a trailing '。'.
        """
        text = self._TAG_RE.sub("", text)
        sentences = []
        for line in text.split('\n'):
            if self.is_english(line):
                line = romajitable.to_kana(line).katakana
            line = line.replace('\n', '').replace(' ', '')
            if len(line) <= 1:
                continue
            if len(line) > self._MAX_SENTENCE_LEN:
                sentences.extend(
                    part + '。'
                    for part in self._SENTENCE_END_RE.split(line)
                    if len(part) > 1
                )
            else:
                sentences.append(line)
        print(sentences)
        return sentences

    def _synthesize(self, tagged_text, speaker_id, n_scale, n_scale_w, l_scale):
        """Run the network on *tagged_text* and return the result of
        ``net_g.infer(...)[0][0, 0]`` as a float numpy array."""
        stn_tst = self.get_text(tagged_text)
        with torch.no_grad():
            x_tst = stn_tst.unsqueeze(0).to(self.dev)
            x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(self.dev)
            sid = torch.LongTensor([speaker_id]).to(self.dev)
            result = self.net_g.infer(
                x_tst, x_tst_lengths, sid=sid,
                noise_scale=n_scale, noise_scale_w=n_scale_w,
                length_scale=l_scale,
            )
        return result[0][0, 0].data.cpu().float().numpy()

    @staticmethod
    def _srt_timestamp(seconds):
        """Format a non-negative number of seconds as an SRT ``HH:MM:SS,mmm`` stamp."""
        total_ms = int(round(seconds * 1000))
        hours, rest = divmod(total_ms, 3_600_000)
        minutes, rest = divmod(rest, 60_000)
        secs, millis = divmod(rest, 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

    def infer(self, text, language, speaker_id, n_scale=0.667, n_scale_w=0.8, l_scale=1):
        """Synthesise *text* as one utterance.

        Args:
            text: text to speak.
            language: one of the modes understood by sle().
            speaker_id: speaker *name* as shown in the UI.
            n_scale, n_scale_w, l_scale: forwarded to the network as
                ``noise_scale``, ``noise_scale_w`` and ``length_scale``.

        Returns:
            ``(sampling_rate, audio)`` for ``gr.Audio``, or ``None`` when
            synthesis failed (the failure is logged).
        """
        try:
            # The default checkpoint is loaded *before* synthesis, so the
            # first request already produces audio.
            self._ensure_model()
            sid = int(self.selection(speaker_id))
            started = time.time()
            audio = self._synthesize(
                self.sle(language, text), sid, n_scale, n_scale_w, l_scale)
            spending_time = "推理时间为:" + str(time.time() - started) + "s"
            print(spending_time)
            return (self.hps.data.sampling_rate, audio)
        except Exception:
            # Stay best-effort for the UI, but leave a trace and keep the
            # currently loaded model untouched.
            logging.getLogger(__name__).exception("TTS inference failed")
            return None

    def infer2(self, text, language, speaker_id, n_scale=0.667, n_scale_w=0.8, l_scale=1):
        """Synthesise long *text* sentence by sentence and write SRT subtitles.

        Bracketed annotations and curly quotes are stripped, the text is
        split with extrac(), every sentence is synthesised separately and
        the clips are concatenated. One SRT cue per sentence is written to
        ``subtitles.srt`` in the working directory.

        Args:
            text: text to speak.
            language: one of the modes understood by sle().
            speaker_id: speaker *name* as shown in the UI.
            n_scale, n_scale_w, l_scale: forwarded to the network as
                ``noise_scale``, ``noise_scale_w`` and ``length_scale``.

        Returns:
            ``((sampling_rate, audio), subtitle_path)``.
        """
        self._ensure_model()
        speaker_id = int(self.selection(speaker_id))
        text = text.translate(self._BRACKET_TABLE)
        sentences = self.extrac(text.replace('“', '').replace('”', ''))

        sampling_rate = self.hps.data.sampling_rate
        file_path = self._SUBTITLE_PATH
        clips = []
        # Track time as an exact sample count; cue boundaries are derived
        # from it so rounding errors do not accumulate across sentences.
        elapsed_samples = 0
        with open(file_path, 'w', encoding='utf-8') as srt:
            for index, sentence in enumerate(sentences, start=1):
                started = time.time()
                # Synthesise the current sentence, not the whole text.
                audio = self._synthesize(
                    self.sle(language, sentence), speaker_id,
                    n_scale, n_scale_w, l_scale)
                spending_time = ("第" + str(index) + "句的推理时间为:"
                                 + str(time.time() - started) + "s")
                print(spending_time)
                time_start = self._srt_timestamp(elapsed_samples / sampling_rate)
                elapsed_samples += len(audio)
                time_end = self._srt_timestamp(elapsed_samples / sampling_rate)
                print(time_end)
                # SRT cues are numbered from 1.
                srt.write(f"{index}\n{time_start} --> {time_end}\n{sentence}\n\n")
                clips.append(audio)

        # np.concatenate rejects an empty list; return silence instead.
        merged = np.concatenate(clips) if clips else np.zeros(0, dtype=np.float32)
        return (sampling_rate, merged), file_path
# Build the application object, then start serving its Blocks interface.
grVits = VitsGradio()
demo = grVits.Vits
demo.launch()