"""Gradio front-end for a multi-speaker VITS text-to-speech model.

The script loads the model configuration and checkpoint from ``lovelive/``,
splits the user's text into short sentences, synthesizes each sentence and
returns the concatenated audio together with an SRT subtitle file.
"""
import datetime
import logging
import os
import re
import time

# Raise numba's logger threshold before the heavier third-party imports below.
logging.getLogger('numba').setLevel(logging.WARNING)

import gradio as gr
import IPython.display as ipd
import numpy as np
import romajitable
import torch

import commons
import utils
from models import SynthesizerTrn
from text import text_to_sequence
from text.symbols import symbols

# Path of the subtitle file written by ``infer`` on every request.
SUBTITLE_PATH = "subtitles.srt"

# Lines longer than this many characters are split on sentence punctuation.
MAX_SENTENCE_LEN = 20

# Display name -> speaker index passed to the model as ``sid``.
# NOTE(review): index 14 is skipped ("三色绘恋2" maps to 15) exactly as in the
# original code; confirm against the checkpoint's speaker table.
SPEAKER_IDS = {
    "高咲侑": 0,
    "歩夢": 1,
    "かすみ": 2,
    "しずく": 3,
    "果林": 4,
    "愛": 5,
    "彼方": 6,
    "せつ菜": 7,
    "エマ": 8,
    "璃奈": 9,
    "栞子": 10,
    "ランジュ": 11,
    "ミア": 12,
    "三色绘恋1": 13,
    "三色绘恋2": 15,
    "派蒙": 16,
}

# A line made only of these characters is treated as English.
_ENGLISH_PATTERN = re.compile("^[A-Za-z0-9.,:;!?()_*\"' ]+$")

# Opening brackets become '<', closing brackets become '>', curly quotes are
# dropped; ``extrac`` later removes every ``<...>`` span.
# NOTE(review): the original lists repeated '(' and ')'; the duplicates may
# have been full-width variants lost in transit - confirm.
_BRACKET_TABLE = str.maketrans({
    '【': '<',
    '[': '<',
    '(': '<',
    '】': '>',
    ']': '>',
    ')': '>',
    '“': None,
    '”': None,
})


def get_text(text, hps):
    """Convert tagged text into a ``torch.LongTensor`` of symbol ids.

    The text is run through ``text_to_sequence`` with the cleaners listed in
    ``hps.data.text_cleaners``; when ``hps.data.add_blank`` is set, the id
    ``0`` is interspersed via ``commons.intersperse``.
    """
    text_norm = text_to_sequence(text, symbols, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    return torch.LongTensor(text_norm)


dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def selection(speaker):
    """Return the model speaker index for the display name *speaker*.

    Raises:
        ValueError: if *speaker* is not a known speaker name.
    """
    try:
        return SPEAKER_IDS[speaker]
    except KeyError:
        raise ValueError(f"unknown speaker: {speaker!r}") from None


def is_japanese(string):
    """Return True if *string* has a character strictly between U+3040 and U+30FF."""
    return any(0x3040 < ord(ch) < 0x30FF for ch in string)


def is_english(string):
    """Return True if *string* is non-empty and made only of ASCII letters,
    digits, spaces and the punctuation ``.,:;!?()_*"'``."""
    return _ENGLISH_PATTERN.fullmatch(string) is not None


def sle(language, tts_input0):
    """Wrap *tts_input0* in the ``[ZH]``/``[JA]`` tags for the chosen language.

    *language* is one of the UI choices: "中文" (Chinese), "日文" (Japanese)
    or "自动" (auto-detect through ``is_japanese``). For the two explicit
    choices, newlines are replaced by '。' and spaces by ','.

    Raises:
        ValueError: if *language* is not one of the three supported choices.
    """
    if language == "自动":
        tag = "[JA]" if is_japanese(tts_input0) else "[ZH]"
        return f"{tag}{tts_input0}{tag}"
    if language == "中文":
        tag = "[ZH]"
    elif language == "日文":
        tag = "[JA]"
    else:
        raise ValueError(f"unsupported language: {language!r}")
    return tag + tts_input0.replace('\n', '。').replace(' ', ',') + tag


def extrac(text):
    """Split *text* into a list of short sentences ready for synthesis.

    Every ``<...>`` span is removed first. The text is then split on
    newlines; a line that ``is_english`` accepts is converted to katakana
    with ``romajitable``; spaces are stripped. Lines of one character or
    less are dropped. Lines longer than ``MAX_SENTENCE_LEN`` are split on
    '。' and '!', and each piece longer than one character is appended with
    a trailing '。'.
    """
    text = re.sub("<[^>]*>", "", text)
    final_list = []
    for line in text.split('\n'):
        if is_english(line):
            line = romajitable.to_kana(line).katakana
        line = line.replace('\n', '').replace(' ', '')
        if len(line) <= 1:
            continue
        if len(line) <= MAX_SENTENCE_LEN:
            final_list.append(line)
            continue
        for piece in re.split(r'。|!', line):
            if len(piece) > 1:
                final_list.append(piece + '。')
    print(final_list)
    return final_list


def _srt_timestamp(delta):
    """Format a ``datetime.timedelta`` as an SRT timestamp ``HH:MM:SS,mmm``."""
    total_ms = delta // datetime.timedelta(milliseconds=1)
    hours, remainder = divmod(total_ms, 3600000)
    minutes, remainder = divmod(remainder, 60000)
    seconds, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"


def infer(text ,language, speaker_id,n_scale= 0.667,n_scale_w = 0.8, l_scale = 1):
    """Synthesize *text* sentence by sentence and write matching subtitles.

    Args:
        text: raw user text; bracketed annotations and curly quotes are
            stripped before synthesis.
        language: UI language choice forwarded to ``sle``.
        speaker_id: speaker display name, resolved through ``selection``.
        n_scale: passed to the model as ``noise_scale``.
        n_scale_w: passed to the model as ``noise_scale_w``.
        l_scale: passed to the model as ``length_scale``.

    Returns:
        ``((sampling_rate, waveform), subtitle_path)`` where ``waveform`` is
        the concatenation of all sentence waveforms.

    Raises:
        ValueError: if the speaker or language is unknown, or if no
            synthesizable sentence remains after cleaning *text*.
    """
    speaker_id = int(selection(speaker_id))
    final_list = extrac(text.translate(_BRACKET_TABLE))
    if not final_list:
        # np.concatenate([]) would raise an obscure ValueError; fail clearly.
        raise ValueError("no synthesizable sentence found in the input text")

    sampling_rate = hps_ms.data.sampling_rate
    audio_fin = []
    elapsed = datetime.timedelta(seconds=0)
    # The context manager guarantees the subtitles are flushed and closed
    # before the path is handed back to the caller.
    with open(SUBTITLE_PATH, 'w', encoding='utf-8') as srt_file:
        for c, sentence in enumerate(final_list, start=1):
            stn_tst = get_text(sle(language, sentence), hps_ms)
            with torch.no_grad():
                x_tst = stn_tst.unsqueeze(0).to(dev)
                x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
                sid = torch.LongTensor([speaker_id]).to(dev)
                t1 = time.time()
                audio = net_g_ms.infer(
                    x_tst,
                    x_tst_lengths,
                    sid=sid,
                    noise_scale=n_scale,
                    noise_scale_w=n_scale_w,
                    length_scale=l_scale,
                )[0][0, 0].data.cpu().float().numpy()
                t2 = time.time()
            spending_time = "第"+str(c)+"句的推理时间为:"+str(t2-t1)+"s"
            print(spending_time)

            time_start = _srt_timestamp(elapsed)
            # Duration comes from the same sampling rate that is returned
            # with the audio, so subtitles stay aligned with playback.
            elapsed += datetime.timedelta(seconds=len(audio) / float(sampling_rate))
            time_end = _srt_timestamp(elapsed)
            print(time_end)
            # Cue numbers are kept 0-based, as the original script wrote them.
            srt_file.write(str(c-1)+'\n'+time_start+' --> '+time_end+'\n'+sentence+'\n\n')
            audio_fin.append(audio)

    return (sampling_rate, np.concatenate(audio_fin)), SUBTITLE_PATH


lan = ["中文","日文","自动"]
idols = ["高咲侑","歩夢","かすみ","しずく","果林","愛","せつ菜","璃奈","栞子","エマ","ランジュ","ミア","派蒙"]

# Load hyper-parameters and build the synthesizer on the selected device.
hps_ms = utils.get_hparams_from_file("lovelive/config.json")
net_g_ms = SynthesizerTrn(
    len(symbols),
    hps_ms.data.filter_length // 2 + 1,
    hps_ms.train.segment_size // hps_ms.data.hop_length,
    n_speakers=hps_ms.data.n_speakers,
    **hps_ms.model).to(dev)
_ = net_g_ms.eval()
_ = utils.load_checkpoint("lovelive/G_936000.pth", net_g_ms)

# UI widgets, in the positional order of ``infer``'s parameters.
inputs = [
    gr.TextArea(
        label="如需实现快速合成,建议在colab上克隆后运行本仓库",
        value="为什么你会那么熟练啊?你和雪菜亲过多少次了?我想做只属于你一个人的学院偶像,所以,请只注视我一个人,好吗?【中文】\nなんでそんなに慣れてんだよっ?せつ菜と…何回キスしたんだよ?どこまであたしを置いてきぼりにすれば気が済むんだよ?[日文]\nI can't choose just one(English)"),
    gr.Dropdown(label="选择语言,目前勉强可以做到自动识别",choices=lan, value="自动", interactive=True),
    gr.Dropdown(label="选择说话人",choices=idols, value="歩夢", interactive=True),
    gr.Slider(minimum= 0,maximum=1.0,label="更改噪声比例,以控制情感", value=0.267),
    gr.Slider(minimum= 0,maximum=1.0,label="更改噪声偏差,以控制音素长短", value=0.7),
    gr.Slider(minimum= 0.1,maximum=10,label="更改时间比例", value=1),
]
outputs = [
    gr.Audio(label="采样率22050"),
    gr.outputs.File(label="字幕文件:subtitles.srt"),
]
iface = gr.Interface(
    fn=infer,
    inputs=inputs,
    outputs=outputs,
    title="Vits",
    description="虹团11人模型",
)
iface.launch()