File size: 8,108 Bytes
0bc5811
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5f25dab
 
a7adc86
a36f6e8
 
43ef6e1
 
 
 
a36f6e8
43ef6e1
 
a36f6e8
 
d1de440
a36f6e8
d1de440
 
 
 
 
 
d220bdb
d1de440
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a7adc86
 
 
 
c434eb9
 
 
 
 
 
 
 
d220bdb
d1de440
d220bdb
 
a7adc86
 
d1de440
 
d220bdb
d1de440
a7adc86
4019e19
a7adc86
 
 
35b4786
 
4019e19
0b7b2a0
a7adc86
 
 
 
0b7b2a0
a7adc86
 
 
 
 
 
 
d220bdb
d1de440
b1a29ae
 
853d005
cfdd268
853d005
cfdd268
8c892cf
a7adc86
 
 
 
 
0b7b2a0
a7adc86
 
 
 
 
 
 
 
 
3d64286
a7adc86
 
 
0b7b2a0
0d34cc5
7d4fd9a
a36f6e8
 
d1de440
 
 
 
a36f6e8
d1de440
a7adc86
d1de440
43ef6e1
 
066e743
0bc5811
066e743
3d64286
a7adc86
0bc5811
 
43ef6e1
 
5bab0bb
43ef6e1
0abe5d6
43ef6e1
0bc5811
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
import ONNXVITS_models
import utils
from text import text_to_sequence
import torch
import commons

def get_text(text, hps):
    """Turn raw text into a ``torch.LongTensor`` of symbol ids.

    Args:
        text: Input text handed to ``text_to_sequence``.
        hps: Hyper-parameter object; ``hps.symbols``,
            ``hps.data.text_cleaners`` and ``hps.data.add_blank`` are read.

    Returns:
        A ``torch.LongTensor`` built from the id sequence. When
        ``hps.data.add_blank`` is truthy the sequence is first passed through
        ``commons.intersperse(..., 0)``.
    """
    ids = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)
    # presumably interleaves the blank id 0 between symbols — confirm in commons
    sequence = commons.intersperse(ids, 0) if hps.data.add_blank else ids
    return torch.LongTensor(sequence)

# Module-level script: build the synthesizer from the "lovelive" config, load
# the trained generator weights and run one forward pass on a fixed sentence.
# NOTE(review): given the ONNXVITS_models module name this forward pass is
# presumably what triggers the ONNX export — confirm in ONNXVITS_models.
hps = utils.get_hparams_from_file("lovelive/config.json")
symbols = hps.symbols
net_g = ONNXVITS_models.SynthesizerTrn(
    len(symbols),  # size of the symbol vocabulary
    hps.data.filter_length // 2 + 1,  # presumably the number of spectrogram bins — TODO confirm
    hps.train.segment_size // hps.data.hop_length,  # training segment length expressed in hops
    n_speakers=hps.data.n_speakers,
    **hps.model)
# eval() / load_checkpoint() results are not needed; "_" discards them.
_ = net_g.eval()
_ = utils.load_checkpoint("lovelive/G_525000.pth", net_g)

# Fixed Japanese sample sentence used as the input for the single forward pass.
text1 = get_text("[JA]ありがとうございます。[JA]", hps)
stn_tst = text1
with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0)  # add a leading batch dimension
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])  # length of the single sequence
    sid = torch.tensor([0])  # speaker id 0
    # The result "o" is not used anywhere in the visible part of this file.
    o = net_g(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8, length_scale=1)
'''
import romajitable
import re
import numpy as np
import logging
logging.getLogger('numba').setLevel(logging.WARNING)
import IPython.display as ipd
import torch
import commons
import utils
import ONNXVITS_infer
from text.symbols import symbols
from text import text_to_sequence
import gradio as gr
import time
def get_text(text, hps):
    """Turn raw text into a ``torch.LongTensor`` of symbol ids.

    Args:
        text: Input text handed to ``text_to_sequence``.
        hps: Hyper-parameter object; ``hps.data.text_cleaners`` and
            ``hps.data.add_blank`` are read. The symbol table comes from the
            module-level ``symbols``.

    Returns:
        A ``torch.LongTensor`` built from the id sequence. When
        ``hps.data.add_blank`` is truthy the sequence is first passed through
        ``commons.intersperse(..., 0)``.
    """
    ids = text_to_sequence(text, symbols, hps.data.text_cleaners)
    # presumably interleaves the blank id 0 between symbols — confirm in commons
    sequence = commons.intersperse(ids, 0) if hps.data.add_blank else ids
    return torch.LongTensor(sequence)

# Speaker display name -> numeric speaker id passed to the model.
# NOTE(review): id 14 is unused and "三色绘恋2" maps to 15, exactly as in the
# previous if/elif chain — confirm against the checkpoint's speaker table.
_SPEAKER_IDS = {
    "高咲侑": 0,
    "歩夢": 1,
    "かすみ": 2,
    "しずく": 3,
    "果林": 4,
    "愛": 5,
    "彼方": 6,
    "せつ菜": 7,
    "エマ": 8,
    "璃奈": 9,
    "栞子": 10,
    "ランジュ": 11,
    "ミア": 12,
    "三色绘恋1": 13,
    "三色绘恋2": 15,
    "派蒙": 16,
}


def selection(speaker):
    """Map a speaker display name to its numeric speaker id.

    The previous implementation assigned the id for "三色绘恋2" but never
    returned it, so that speaker yielded ``None``; the table lookup returns
    the id for every known name.

    Args:
        speaker: Display name of the speaker.

    Returns:
        The integer speaker id, or ``None`` when the name is unknown (same as
        before for unknown names).
    """
    return _SPEAKER_IDS.get(speaker)
def is_japanese(string):
    """Report whether *string* contains at least one kana character.

    A character counts when its code point lies strictly between U+3040 and
    U+30FF, i.e. both bounds are excluded. Kanji are outside that range and
    do not count.

    Returns:
        ``True`` if any character falls in the range, otherwise ``False``
        (including for an empty string).
    """
    return any(0x3040 < ord(ch) < 0x30FF for ch in string)
def is_english(string):
    """Report whether *string* consists only of ASCII letters, digits, spaces
    and the punctuation listed in the pattern below.

    Returns:
        ``True`` when the whole string matches, ``False`` otherwise
        (an empty string does not match).
    """
    import re
    matched = re.fullmatch('^[A-Za-z0-9.,:;!?()_*"\' ]+$', string)
    return matched is not None
def sle(language,tts_input0):
    """Wrap *tts_input0* in the language tags expected by the text front end.

    Args:
        language: One of "中文", "日文" or "自动".
        tts_input0: The sentence to tag.

    Returns:
        The tagged sentence. For "中文" and "日文" newlines become "。" and
        spaces become "," before tagging; for "自动" the text is tagged
        unchanged, with [JA] when ``is_japanese`` accepts it and [ZH]
        otherwise. Any other *language* value yields ``None``.
    """
    if language == "自动":
        tag = "[JA]" if is_japanese(tts_input0) else "[ZH]"
        return f"{tag}{tts_input0}{tag}"
    if language == "中文":
        tag = "[ZH]"
    elif language == "日文":
        tag = "[JA]"
    else:
        # Unknown language selection: nothing to build.
        return None
    normalized = tts_input0.replace('\n','。').replace(' ',',')
    return tag + normalized + tag
def extrac(text):
    """Split input text into short sentences ready for synthesis.

    Steps, per input line (lines are separated by newlines):
      * anything enclosed in <...> is removed from the whole text first;
      * a line accepted by ``is_english`` is replaced by
        ``romajitable.to_kana(line).katakana``;
      * spaces are removed;
      * lines of length 0 or 1 are dropped;
      * lines of up to 20 characters are kept whole;
      * longer lines are split on "。" and every fragment longer than one
        character is kept with "。" appended.

    The previous version reused the outer loop variable inside the fragment
    loop and then appended it unconditionally, so the last fragment of every
    long line was emitted a second time. Each piece is now emitted once.

    Args:
        text: Raw text to split.

    Returns:
        List of non-empty sentence strings, in input order. The list is also
        printed, as before.
    """
    text = re.sub("<[^>]*>","",text)
    final_list = []
    for line in re.split(r'\n', text):
        if is_english(line):
            line = romajitable.to_kana(line).katakana
        line = line.replace('\n','').replace(' ','')
        if len(line) <= 1:
            continue
        if len(line) <= 20:
            final_list.append(line)
            continue
        # Long line: synthesize clause by clause instead of all at once.
        for fragment in re.split(r'。', line):
            if len(fragment) > 1:
                final_list.append(fragment+'。')
    print(final_list)
    return final_list
def infer(language,text,speaker_id, n_scale= 0.667,n_scale_w = 0.8, l_scale = 1 ):
    """Synthesize *text* sentence by sentence and return the joined audio.

    Args:
        language: Language selection forwarded to ``sle``.
        text: Raw input text. Bracketed spans are rewritten to <...> so that
            ``extrac`` strips them; curly double quotes are removed.
        speaker_id: Speaker display name, resolved through ``selection``.
        n_scale: Passed to the model as ``noise_scale``.
        n_scale_w: Passed to the model as ``noise_scale_w``.
        l_scale: Passed to the model as ``length_scale``.

    Returns:
        Tuple ``(hps_ms.data.sampling_rate, audio)`` where ``audio`` is the
        concatenation of the per-sentence numpy arrays.

    Raises:
        ValueError: If no sentence could be synthesized (empty input or every
            sentence failed). Previously this surfaced as numpy's generic
            "need at least one array to concatenate" ValueError.
    """
    speaker_id = int(selection(speaker_id))
    for opener in ['【','[','(','(']:
        text = text.replace(opener,'<')
    for closer in ['】',']',')',')']:
        text = text.replace(closer,'>')
    final_list = extrac(text.replace('“','').replace('”',''))
    audio_fin = []
    for c, sentence in enumerate(final_list, start=1):
        try:
            stn_tst = get_text(sle(language,sentence), hps_ms)
            with torch.no_grad():
                x_tst = stn_tst.unsqueeze(0).to(dev)
                x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
                sid = torch.LongTensor([speaker_id]).to(dev)
                t1 = time.time()
                # assumes the first returned element is (batch, channel, samples) — TODO confirm
                audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
                t2 = time.time()
        except Exception:
            # Best effort: skip a sentence that cannot be synthesized, but do
            # not swallow KeyboardInterrupt/SystemExit like a bare except did.
            print('存在非法字符')
        else:
            spending_time = "第"+str(c)+"句的推理时间为:"+str(t2-t1)+"s"
            print(spending_time)
            audio_fin.append(audio)
    if not audio_fin:
        raise ValueError("no audio was synthesized: the input produced no usable sentence")
    return (hps_ms.data.sampling_rate, np.concatenate(audio_fin))
# Choices for the language dropdown; sle() handles exactly these three values.
lan = ["中文","日文","自动"]
# Choices for the speaker dropdown.
# NOTE(review): selection() also knows "彼方", "三色绘恋1" and "三色绘恋2",
# which are not offered here — confirm whether that is intentional.
idols = ["高咲侑","歩夢","かすみ","しずく","果林","愛","せつ菜","璃奈","栞子","エマ","ランジュ","ミア","派蒙"]
# All tensors in infer() are moved to this device.
dev = torch.device("cpu")
hps_ms = utils.get_hparams_from_file("lovelive/config.json")
net_g_ms = ONNXVITS_infer.SynthesizerTrn(
    len(symbols),  # size of the symbol vocabulary imported from text.symbols
    hps_ms.data.filter_length // 2 + 1,  # presumably the number of spectrogram bins — TODO confirm
    hps_ms.train.segment_size // hps_ms.data.hop_length,  # training segment length expressed in hops
    n_speakers=hps_ms.data.n_speakers,
    **hps_ms.model)
# eval() / load_checkpoint() results are not needed; "_" discards them.
_ = net_g_ms.eval()
_ = utils.load_checkpoint("lovelive/G_525000.pth", net_g_ms)
# Gradio UI: one tab with a text area, language/speaker dropdowns, three
# sliders and a button that runs infer() and plays the result.
app = gr.Blocks()
with app:
    with gr.Tabs():

        with gr.TabItem("虹团vits模型,现可按句分割实现长文本合成,onnx导出后存在质量损失,建议本地运行vits模型"):

            # Text to synthesize, pre-filled with a long sample passage.
            tts_input1 = gr.TextArea(label="去标贝新模型,老版本在lovelive文件夹中", value="数千怀言者已经为你集结,列队在通往主舰桥的过道上。他们歌唱着你们名字,高声呼喊,以一种原始的、咆哮般的合唱作为对你的致敬。你从他们中间走过,一边点头,一边接受他们的赞美,你沉溺其中,几乎被他们巨大的音量所震撼。\n他们之中没有一个胆敢直视你。没有一个能够承受。你对他们超人类的眼睛来说都太过光辉。从他们中间走过时,你巨大的影子从他们身上掠过,他们立时将目光挪开,眼含泪水,吟诵你的大名时甚至不敢看你一眼。他们的吟唱中含有愤怒。几乎是疯狂的绝望。那感觉就好像他们害怕停下来,害怕自己会喘息停顿,好像尖叫出你的名字是唯一能让他们活着的事情。\n或许确实如此。作为对他们崇拜的回应,你谦虚地抬抬手,随后走进主舰桥。\nI In a word, Horus is a joker.")
            language = gr.Dropdown(label="选择语言,目前勉强可以做到自动识别",choices=lan, value="自动", interactive=True)
            # The three sliders feed infer()'s n_scale, n_scale_w and l_scale.
            para_input1 = gr.Slider(minimum= 0.01,maximum=1.0,label="更改噪声比例,以控制情感", value=0.667)
            # NOTE(review): UI default is 0.7 while infer()'s own default for n_scale_w is 0.8.
            para_input2 = gr.Slider(minimum= 0.01,maximum=1.0,label="更改噪声偏差,以控制音素长短", value=0.7)
            para_input3 = gr.Slider(minimum= 0.1,maximum=10,label="更改时间比例", value=1)
            tts_submit = gr.Button("Generate", variant="primary")
            speaker1 = gr.Dropdown(label="选择说话人",choices=idols, value="歩夢", interactive=True)
            tts_output2 = gr.Audio(label="Output")
            # Input order must match infer(language, text, speaker_id, n_scale, n_scale_w, l_scale).
            tts_submit.click(infer, [language,tts_input1,speaker1,para_input1,para_input2,para_input3], [tts_output2])
    #app.launch(share=True)
    app.launch()
'''