import romajitable
import re
import numpy as np
import logging
logging.getLogger('numba').setLevel(logging.WARNING)  # silence numba's verbose logging
import torch
import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
import gradio as gr
import time
import datetime
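# Convert raw text to the tensor of symbol IDs the model consumes; optionally
# intersperse blank tokens between symbols, as configured by hps.data.add_blank.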
def get_text(text, hps):
    text_norm = text_to_sequence(text, symbols, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm
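# Run inference on the GPU when available, otherwise fall back to CPU.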
dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Display name -> speaker ID in the multi-speaker model.
# (ID 14 is unused; the mapping jumps from 13 to 15.)
speakers = {
    "高咲侑": 0, "歩夢": 1, "かすみ": 2, "しずく": 3, "果林": 4, "愛": 5,
    "彼方": 6, "せつ菜": 7, "エマ": 8, "璃奈": 9, "栞子": 10,
    "ランジュ": 11, "ミア": 12, "三色绘恋1": 13, "三色绘恋2": 15, "派蒙": 16,
}
def selection(speaker):
    return speakers[speaker]
def is_japanese(string):
    # True if the string contains any kana (Hiragana/Katakana block) characters.
    return any(0x3040 < ord(ch) < 0x30FF for ch in string)
def is_english(string):
    # True if the string consists only of ASCII letters, digits, and basic punctuation.
    pattern = re.compile('^[A-Za-z0-9.,:;!?()_*"\' ]+$')
    return bool(pattern.fullmatch(string))
def sle(language, tts_input0):
    # Wrap the text in the language tags expected by the text cleaners.
    if language == "中文":
        return "[ZH]" + tts_input0.replace('\n', '。').replace(' ', ',') + "[ZH]"
    elif language == "日文":
        return "[JA]" + tts_input0.replace('\n', '。').replace(' ', ',') + "[JA]"
    elif language == "自动":
        # Crude auto-detection: tag as Japanese if any kana is present, else Chinese.
        return f"[JA]{tts_input0}[JA]" if is_japanese(tts_input0) else f"[ZH]{tts_input0}[ZH]"
def extrac(text):
    # Strip <...> markup, split the input into lines, transliterate pure-English
    # lines to katakana, and break long lines into sentence-sized chunks.
    text = re.sub("<[^>]*>", "", text)
    final_list = []
    for line in re.split(r'\n', text):
        if is_english(line):
            line = romajitable.to_kana(line).katakana
        line = line.replace('\n', '').replace(' ', '')
        if len(line) > 1:
            # Maximum length of a single sentence: 20 characters.
            if len(line) > 20:
                for piece in re.split(r'。|!', line):
                    if len(piece) > 1:
                        final_list.append(piece + '。')
            else:
                final_list.append(line)
    final_list = [x for x in final_list if x != '']
    print(final_list)
    return final_list
def infer(text, language, speaker_id, n_scale=0.667, n_scale_w=0.8, l_scale=1):
    speaker_id = int(selection(speaker_id))
    # Normalize every bracket style to <...> so extrac() strips it as markup.
    for ch in ['【', '[', '(', '(']:
        text = text.replace(ch, '<')
    for ch in ['】', ']', ')', ')']:
        text = text.replace(ch, '>')
    final_list = extrac(text.replace('“', '').replace('”', ''))
    audio_fin = []
    c = 0
    t = datetime.timedelta(seconds=0)
    file_path = "subtitles.srt"
    with open(file_path, 'w', encoding='utf-8') as f1:
        for sentence in final_list:
            c += 1
            stn_tst = get_text(sle(language, sentence), hps_ms)
            with torch.no_grad():
                x_tst = stn_tst.unsqueeze(0).to(dev)
                x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
                sid = torch.LongTensor([speaker_id]).to(dev)
                t1 = time.time()
                audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale,
                                       noise_scale_w=n_scale_w,
                                       length_scale=l_scale)[0][0, 0].data.cpu().float().numpy()
                t2 = time.time()
                print(f"Inference time for sentence {c}: {t2 - t1}s")
                # SRT timestamps need zero-padded three-digit milliseconds.
                time_start = str(t).split(".")[0] + "," + f"{t.microseconds // 1000:03d}"
                t += datetime.timedelta(seconds=len(audio) / hps_ms.data.sampling_rate)
                time_end = str(t).split(".")[0] + "," + f"{t.microseconds // 1000:03d}"
                # SRT cue numbering is 1-based.
                f1.write(f"{c}\n{time_start} --> {time_end}\n{sentence}\n\n")
                audio_fin.append(audio)
    return (hps_ms.data.sampling_rate, np.concatenate(audio_fin)), file_path
lan = ["中文", "日文", "自动"]
# Speakers exposed in the UI; the mapping above also covers 三色绘恋1/2.
idols = ["高咲侑", "歩夢", "かすみ", "しずく", "果林", "愛", "彼方", "せつ菜", "璃奈", "栞子", "エマ", "ランジュ", "ミア", "派蒙"]
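# Load hyperparameters, build the multi-speaker synthesizer, and restore the pretrained checkpoint.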
hps_ms = utils.get_hparams_from_file("lovelive/config.json")
net_g_ms = SynthesizerTrn(
    len(symbols),
    hps_ms.data.filter_length // 2 + 1,
    hps_ms.train.segment_size // hps_ms.data.hop_length,
    n_speakers=hps_ms.data.n_speakers,
    **hps_ms.model).to(dev)
_ = net_g_ms.eval()
_ = utils.load_checkpoint("lovelive/G_936000.pth", net_g_ms)
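# Gradio UI: text box, language/speaker selectors, and sliders for the sampling controls.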
inputs = [gr.TextArea(label="For faster synthesis, clone this repo and run it on Colab", value="为什么你会那么熟练啊?你和雪菜亲过多少次了?我想做只属于你一个人的学院偶像,所以,请只注视我一个人,好吗?【中文】\nなんでそんなに慣れてんだよっ?せつ菜と…何回キスしたんだよ?どこまであたしを置いてきぼりにすれば気が済むんだよ?[日文]\nI can't choose just one(English)"),
           gr.Dropdown(label="Language (auto-detection is still rough)", choices=lan, value="自动", interactive=True),
           gr.Dropdown(label="Speaker", choices=idols, value="歩夢", interactive=True),
           gr.Slider(minimum=0, maximum=1.0, label="Noise scale (controls emotion)", value=0.267),
           gr.Slider(minimum=0, maximum=1.0, label="Noise scale W (controls phoneme length)", value=0.7),
           gr.Slider(minimum=0.1, maximum=10, label="Length scale (speaking rate)", value=1)]
outputs = [gr.Audio(label="Output audio (22050 Hz)"), gr.File(label="Subtitle file: subtitles.srt")]
iface = gr.Interface(
    fn=infer,
    inputs=inputs,
    outputs=outputs,
    title="Vits",
    description="Nijigasaki 11-member model",
)
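# launch() serves the app locally; passing share=True would create a temporary public Gradio link.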
iface.launch()