File size: 6,692 Bytes
5f25dab a7adc86 a36f6e8 43ef6e1 e822849 43ef6e1 a36f6e8 585a2ef 0839393 d1de440 a36f6e8 d1de440 d0831db d1de440 d220bdb d1de440 a7adc86 c434eb9 d220bdb d1de440 d220bdb a7adc86 d1de440 d220bdb d1de440 a7adc86 4019e19 a7adc86 35b4786 4019e19 8957401 0b7b2a0 a7adc86 42750a9 a7adc86 0b7b2a0 a7adc86 58dd714 a7adc86 c300a0f d1de440 b1a29ae 853d005 cfdd268 853d005 cfdd268 8c892cf a7adc86 05f8fbb 0839393 a7adc86 05f8fbb 0839393 05f8fbb 0839393 0b7b2a0 0d34cc5 a36f6e8 e822849 d1de440 1de5cf5 d1de440 468b67d c300a0f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 |
import romajitable
import re
import numpy as np
import logging
logging.getLogger('numba').setLevel(logging.WARNING)
import IPython.display as ipd
import torch
import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
import gradio as gr
import time
import datetime
import os
def get_text(text, hps):
text_norm = text_to_sequence(text, symbols, hps.data.text_cleaners)
if hps.data.add_blank:
text_norm = commons.intersperse(text_norm, 0)
text_norm = torch.LongTensor(text_norm)
return text_norm
dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
def selection(speaker):
if speaker == "高咲侑":
spk = 0
return spk
elif speaker == "歩夢":
spk = 1
return spk
elif speaker == "かすみ":
spk = 2
return spk
elif speaker == "しずく":
spk = 3
return spk
elif speaker == "果林":
spk = 4
return spk
elif speaker == "愛":
spk = 5
return spk
elif speaker == "彼方":
spk = 6
return spk
elif speaker == "せつ菜":
spk = 7
return spk
elif speaker == "エマ":
spk = 8
return spk
elif speaker == "璃奈":
spk = 9
return spk
elif speaker == "栞子":
spk = 10
return spk
elif speaker == "ランジュ":
spk = 11
return spk
elif speaker == "ミア":
spk = 12
return spk
elif speaker == "三色绘恋1":
spk = 13
return spk
elif speaker == "三色绘恋2":
spk = 15
elif speaker == "派蒙":
spk = 16
return spk
def is_japanese(string):
for ch in string:
if ord(ch) > 0x3040 and ord(ch) < 0x30FF:
return True
return False
def is_english(string):
import re
pattern = re.compile('^[A-Za-z0-9.,:;!?()_*"\' ]+$')
if pattern.fullmatch(string):
return True
else:
return False
def sle(language,tts_input0):
if language == "中文":
tts_input1 = "[ZH]" + tts_input0.replace('\n','。').replace(' ',',') + "[ZH]"
return tts_input1
if language == "自动":
tts_input1 = f"[JA]{tts_input0}[JA]" if is_japanese(tts_input0) else f"[ZH]{tts_input0}[ZH]"
return tts_input1
elif language == "日文":
tts_input1 = "[JA]" + tts_input0.replace('\n','。').replace(' ',',') + "[JA]"
return tts_input1
def extrac(text):
text = re.sub("<[^>]*>","",text)
result_list = re.split(r'\n', text)
final_list = []
for i in result_list:
if is_english(i):
i = romajitable.to_kana(i).katakana
i = i.replace('\n','').replace(' ','')
#Current length of single sentence: 20
if len(i)>1:
if len(i) > 20:
try:
cur_list = re.split(r'。|!', i)
for i in cur_list:
if len(i)>1:
final_list.append(i+'。')
except:
pass
else:
final_list.append(i)
final_list = [x for x in final_list if x != '']
print(final_list)
return final_list
def infer(text ,language, speaker_id,n_scale= 0.667,n_scale_w = 0.8, l_scale = 1):
speaker_id = int(selection(speaker_id))
a = ['【','[','(','(']
b = ['】',']',')',')']
for i in a:
text = text.replace(i,'<')
for i in b:
text = text.replace(i,'>')
final_list = extrac(text.replace('“','').replace('”',''))
audio_fin = []
c = 0
t = datetime.timedelta(seconds=0)
content = []
for sentence in final_list:
c +=1
stn_tst = get_text(sle(language,sentence), hps_ms)
with torch.no_grad():
x_tst = stn_tst.unsqueeze(0).to(dev)
x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
sid = torch.LongTensor([speaker_id]).to(dev)
t1 = time.time()
audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
t2 = time.time()
spending_time = "第"+str(c)+"句的推理时间为:"+str(t2-t1)+"s"
print(spending_time)
time_start = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
last_time = datetime.timedelta(seconds=len(audio)/float(22050))
t+=last_time
time_end = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
print(time_end)
content.append(str(c-1)+'\n'+time_start+' --> '+time_end+'\n'+sentence+'\n\n')
audio_fin.append(audio)
return (hps_ms.data.sampling_rate, np.concatenate(audio_fin)),content
lan = ["中文","日文","自动"]
idols = ["高咲侑","歩夢","かすみ","しずく","果林","愛","せつ菜","璃奈","栞子","エマ","ランジュ","ミア","派蒙"]
hps_ms = utils.get_hparams_from_file("lovelive/config.json")
net_g_ms = SynthesizerTrn(
len(symbols),
hps_ms.data.filter_length // 2 + 1,
hps_ms.train.segment_size // hps_ms.data.hop_length,
n_speakers=hps_ms.data.n_speakers,
**hps_ms.model).to(dev)
_ = net_g_ms.eval()
_ = utils.load_checkpoint("lovelive/G_936000.pth", net_g_ms)
inputs = [gr.TextArea(label="如需实现快速合成,建议在colab上克隆后运行本仓库", value="为什么你会那么熟练啊?你和雪菜亲过多少次了?我想做只属于你一个人的学院偶像,所以,请只注视我一个人,好吗?【中文】\nなんでそんなに慣れてんだよっ?せつ菜と…何回キスしたんだよ?どこまであたしを置いてきぼりにすれば気が済むんだよ?[日文]\nI can't choose just one(English)"),
gr.Dropdown(label="选择语言,目前勉强可以做到自动识别",choices=lan, value="自动", interactive=True),
gr.Dropdown(label="选择说话人",choices=idols, value="歩夢", interactive=True),
gr.Slider(minimum= 0,maximum=1.0,label="更改噪声比例,以控制情感", value=0.267),
gr.Slider(minimum= 0,maximum=1.0,label="更改噪声偏差,以控制音素长短", value=0.7),
gr.Slider(minimum= 0.1,maximum=10,label="更改时间比例", value=1)]
outputs=[gr.Audio(label="采样率22050"), gr.outputs.File(label="字幕文件:subtitles.srt")]
iface = gr.Interface(
fn=infer,
inputs=inputs,
outputs=outputs,
title="Vits",
description="虹团11人模型",
)
iface.launch() |