File size: 8,592 Bytes
8a1292d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9116564
8a1292d
 
 
 
 
 
 
 
 
 
 
 
 
 
9116564
8a1292d
9116564
 
 
 
 
 
 
 
 
 
cabd1e3
8a1292d
 
 
9116564
 
 
8a1292d
 
 
 
3da2529
8a1292d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fec9963
 
 
3ac1d80
171db4e
fec9963
8654ef4
 
 
221b8b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1d45c44
3560112
171db4e
 
 
3560112
 
12de674
3560112
 
 
 
171db4e
 
3560112
 
 
 
 
171db4e
 
fec9963
8a1292d
0627904
9116564
8a1292d
d2565d9
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import sys, os

# On macOS, switch on PyTorch's MPS fallback flag. This is done here so the
# variable is already in the environment when torch is imported further down.
if sys.platform == "darwin":
    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

import logging

# Limit these third-party loggers to WARNING and above.
logging.getLogger("numba").setLevel(logging.WARNING)
logging.getLogger("markdown_it").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("matplotlib").setLevel(logging.WARNING)

# Root logger configuration: INFO level, "| name | LEVEL | message" format.
logging.basicConfig(level=logging.INFO, format="| %(name)s | %(levelname)s | %(message)s")

# Module-level logger used by the script entry point.
logger = logging.getLogger(__name__)

import torch
import argparse
import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import cleaned_text_to_sequence, get_bert
from text.cleaner import clean_text
import gradio as gr
import webbrowser


# Global synthesizer handle; stays None until the __main__ block builds the model.
net_g = None


def get_text(text, language_str, hps):
    """Convert raw text into the model inputs used for synthesis.

    Args:
        text: Input text to be cleaned and converted.
        language_str: Language tag forwarded to the text cleaner, the
            sequence converter and the BERT feature extractor.
        hps: Hyper-parameter object; only ``hps.data.add_blank`` is read.

    Returns:
        A tuple ``(bert, phone, tone, language)``. ``phone``, ``tone`` and
        ``language`` are ``torch.LongTensor`` objects; ``bert`` is whatever
        ``get_bert`` returns.

    Raises:
        AssertionError: If the last dimension of ``bert`` does not match the
            number of phone ids.
    """
    norm_text, phone, tone, word2ph = clean_text(text, language_str)
    phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)

    if hps.data.add_blank:
        # Apply the same blank interspersing to all three id sequences.
        phone, tone, language = (
            commons.intersperse(seq, 0) for seq in (phone, tone, language)
        )
        # Keep the per-word phone counts in step with the longer sequences:
        # every count doubles and the first word gets one extra slot.
        # NOTE(review): presumably matches the blanks intersperse() adds
        # between and around items - confirm against commons.intersperse.
        for idx, count in enumerate(word2ph):
            word2ph[idx] = count * 2
        word2ph[0] += 1

    bert = get_bert(norm_text, word2ph, language_str)
    del word2ph

    # BERT features must line up one-to-one with the phone sequence.
    assert bert.shape[-1] == len(phone)

    phone_ids = torch.LongTensor(phone)
    tone_ids = torch.LongTensor(tone)
    language_ids = torch.LongTensor(language)
    return bert, phone_ids, tone_ids, language_ids
import soundfile as sf
def infer(text, sdp_ratio, noise_scale, noise_scale_w, length_scale, sid):
    """Synthesize speech for ``text`` and also dump it to ``tmp.wav``.

    Relies on the module-level globals ``hps``, ``device`` and ``net_g`` that
    the script entry point sets up before the UI is launched.

    Args:
        text: Text to synthesize. It is always processed with the "ZH"
            language tag.
        sdp_ratio: Forwarded unchanged to ``net_g.infer``.
        noise_scale: Forwarded unchanged to ``net_g.infer``.
        noise_scale_w: Forwarded unchanged to ``net_g.infer``.
        length_scale: Forwarded unchanged to ``net_g.infer``.
        sid: Speaker name; must be a key of ``hps.data.spk2id``.

    Returns:
        The generated waveform as a float NumPy array.

    Raises:
        KeyError: If ``sid`` is not present in ``hps.data.spk2id``.
    """
    bert, phones, tones, lang_ids = get_text(text, "ZH", hps)
    with torch.no_grad():
        # Move everything to the target device and add a leading batch axis.
        x_tst = phones.to(device).unsqueeze(0)
        tones = tones.to(device).unsqueeze(0)
        lang_ids = lang_ids.to(device).unsqueeze(0)
        bert = bert.to(device).unsqueeze(0)
        x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
        del phones
        speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
        output = net_g.infer(
            x_tst,
            x_tst_lengths,
            speakers,
            tones,
            lang_ids,
            bert,
            sdp_ratio=sdp_ratio,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
        )
        # First returned item, first batch entry, first channel -> 1-D waveform.
        audio = output[0][0, 0].data.cpu().float().numpy()
        # Drop the device tensors eagerly to keep peak memory down.
        del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers

    # Write the WAV with the model's configured sampling rate (the same value
    # tts_fn reports to the UI) instead of a hard-coded 44100 Hz, so the file
    # header is correct for any configuration.
    # NOTE: "tmp.wav" is a fixed path shared by every request.
    sf.write("tmp.wav", audio, hps.data.sampling_rate)
    return audio
def convert_wav_to_ogg(wav_file):
    """Move a WAV file to ``in/in.wav`` and transcode it to ``out/out.ogg``.

    The transcoding is done by running ``ffmpeg`` with the ``libopus`` codec.

    Args:
        wav_file: Object whose ``name`` attribute is the path of the WAV file
            (for example a file handle). Only ``name`` is read.

    Returns:
        The relative path ``out/out.ogg``. The path is returned even when
        ffmpeg cannot be started or exits with an error; in that case a
        warning is logged and the file may be missing or stale.

    Raises:
        OSError: If the input file cannot be moved into ``in/``.
    """
    import subprocess  # local import so this block brings its own dependency

    os.makedirs("out", exist_ok=True)
    os.makedirs("in", exist_ok=True)
    output_path_ogg = os.path.join("out", "out.ogg")
    renamed_input_path = os.path.join("in", "in.wav")

    # os.replace overwrites an existing destination on every platform, whereas
    # os.rename raises on Windows once in/in.wav exists from an earlier call.
    # NOTE: both paths are fixed, so concurrent requests share them.
    os.replace(wav_file.name, renamed_input_path)

    command = ["ffmpeg", "-i", renamed_input_path, "-acodec", "libopus", "-y", output_path_ogg]
    log = logging.getLogger(__name__)
    try:
        # Pass the argument list directly instead of building a shell string.
        completed = subprocess.run(command, check=False)
    except OSError as err:
        # e.g. ffmpeg is not installed / not on PATH.
        log.warning("Could not run ffmpeg: %s", err)
    else:
        if completed.returncode != 0:
            log.warning("ffmpeg exited with status %d", completed.returncode)
    return output_path_ogg
def tts_fn(text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale):
    """Gradio callback: synthesize ``text`` and convert the result to OGG.

    Args:
        text: Text to synthesize.
        speaker: Speaker name, passed to ``infer`` as ``sid``.
        sdp_ratio: Forwarded to ``infer``.
        noise_scale: Forwarded to ``infer``.
        noise_scale_w: Forwarded to ``infer``.
        length_scale: Forwarded to ``infer``.

    Returns:
        A tuple ``("Success", (sampling_rate, audio), ogg_path)`` matching the
        three UI outputs (message, audio player, downloadable file).

    Raises:
        FileNotFoundError: If ``tmp.wav`` was not produced by ``infer``.
    """
    with torch.no_grad():
        audio = infer(
            text,
            sdp_ratio=sdp_ratio,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
            sid=speaker,
        )

    # Open tmp.wav only to confirm it exists and to obtain a handle carrying
    # its path, then close it BEFORE converting: the converter moves the file
    # and only reads the handle's ``name``, and moving a file that is still
    # open fails on Windows.
    with open('tmp.wav', 'rb') as wav_file:
        pass
    newogg = convert_wav_to_ogg(wav_file)
    return "Success", (hps.data.sampling_rate, audio), newogg


if __name__ == "__main__":
    # Script entry point: parse CLI options, load the model, build the Gradio
    # UI and launch it.
    # NOTE: hps, device and net_g are bound at module level on purpose; the
    # infer()/tts_fn() callbacks read them as globals.
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_dir", default="./logs/xingtong/xt_new.pth", help="path of your model")
    parser.add_argument("--config_dir", default="./configs/config.json", help="path of your config file")
    # Accept "--share", "--share True" and "--share False". Previously the raw
    # string was stored (so "False" was truthy) and the option was never used.
    parser.add_argument(
        "--share",
        nargs="?",
        const=True,
        default=False,
        type=lambda raw: str(raw).strip().lower() in ("1", "true", "yes", "y", "on"),
        help="make link public",
    )
    parser.add_argument("-d", "--debug", action="store_true", help="enable DEBUG-LEVEL log")

    args = parser.parse_args()
    if args.debug:
        logger.info("Enable DEBUG-LEVEL log")
        # logging.basicConfig() is a no-op here because the root logger was
        # already configured at import time, so set the root level directly.
        logging.getLogger().setLevel(logging.DEBUG)
    hps = utils.get_hparams_from_file(args.config_dir)
    # Alternative kept for reference (currently disabled): prefer "mps" when
    # sys.platform == "darwin" and torch.backends.mps.is_available(), falling
    # back to "cpu" otherwise.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    net_g = SynthesizerTrn(
        len(symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model).to(device)
    _ = net_g.eval()

    # Load weights only; the optimizer state is skipped.
    _ = utils.load_checkpoint(args.model_dir, net_g, None, skip_optimizer=True)

    speaker_ids = hps.data.spk2id
    speakers = list(speaker_ids.keys())
    with gr.Blocks() as app:
        with gr.Row():
            # Left column: project notes and synthesis controls.
            with gr.Column():


                gr.Markdown(value="""
                星瞳 Bert-Vits2在线语音生成\n
                1、模型作者:数字星瞳企划 https://t.me/xingtong25680 \n
                \n
                2、原项目地址:https://github.com/Stardust-minus/Bert-VITS2\n
                3、使用此模型进行二创请注明AI生成,以及该项目地址。\n
                4、如果想生成超长txt文本的音频请使用colab。 https://colab.research.google.com/drive/13ek8_j1aknr-pbjj3NXxSM4vBIsracU3?usp=drive_link\n

                """)
                text = gr.TextArea(label="Text", placeholder="Input Text Here",
                                      value="这里是数字星瞳企画,请在电报搜索星瞳全拼加二五六八零,获取最新更新进展。")
                # Guard against an empty speaker map instead of raising IndexError.
                speaker = gr.Dropdown(choices=speakers, value=speakers[0] if speakers else None, label='Speaker')
                sdp_ratio = gr.Slider(minimum=0, maximum=1, value=0.2, step=0.01, label='语调变化')
                noise_scale = gr.Slider(minimum=0.1, maximum=1.5, value=0.6, step=0.01, label='感情变化')
                noise_scale_w = gr.Slider(minimum=0.1, maximum=1.4, value=0.8, step=0.01, label='音节发音长度变化')
                length_scale = gr.Slider(minimum=0.1, maximum=2, value=1, step=0.01, label='语速')
                btn = gr.Button("开启AI语音之旅吧!", variant="primary")
            # Right column: outputs and links to related model spaces.
            with gr.Column():
                text_output = gr.Textbox(label="Message")
                audio_output = gr.Audio(label="Output Audio")
                ogg_output = gr.File(label="Converted OGG file")
                gr.Markdown(value="""
                模型汇总:\n
                星瞳整合 https://huggingface.co/spaces/digitalxingtong/Xingtong-All-in-One\n
                甜甜叫花鸡 https://huggingface.co/spaces/digitalxingtong/Jiaohuaji-Bert-Vits2 \n
                七海 https://huggingface.co/spaces/digitalxingtong/Nanami-Bert-Vits2 \n
                东雪莲 https://huggingface.co/spaces/digitalxingtong/Azuma-Bert-Vits2 \n
                嘉然 https://huggingface.co/spaces/digitalxingtong/Jiaran-Bert-Vits2 \n
                乃琳 https://huggingface.co/spaces/digitalxingtong/Eileen-Bert-Vits2  \n
                恬豆 https://huggingface.co/spaces/digitalxingtong/Dou-Bert-Vits2 \n
                奶绿 杂谈 https://huggingface.co/spaces/digitalxingtong/Nailv-Bert-Vits2 \n
                奶绿 朗读 https://huggingface.co/spaces/digitalxingtong/Nailv-read-Bert-Vits2 \n  
                露早 https://huggingface.co/spaces/digitalxingtong/Luzao-Bert-Vits2 \n 
                柚恩 https://huggingface.co/spaces/digitalxingtong/Un-Bert-Vits2 \n
                米诺 https://huggingface.co/spaces/digitalxingtong/Minuo-Bert-Vits2 \n
                扇宝 https://huggingface.co/spaces/digitalxingtong/Shanbao-Bert-Vits2 \n
                牧牧白 https://huggingface.co/spaces/digitalxingtong/Miiu-Bert-Vits2 \n
                吉诺儿kino https://huggingface.co/spaces/digitalxingtong/Kino-Bert-Vits2 \n
                九夏 https://huggingface.co/spaces/digitalxingtong/Jiuxia-Bert-Vits2 \n
                卡缇娅 https://huggingface.co/spaces/digitalxingtong/Yaya-Bert-Vits2 \n
                理想_ideal https://huggingface.co/spaces/digitalxingtong/Lixiang-Bert-Vits2 \n
                阿梓 https://huggingface.co/spaces/digitalxingtong/Azusa-Bert-Vits2 \n
                鹿鸣 https://huggingface.co/spaces/digitalxingtong/Luming-Bert-Vits2 \n
                永雏塔菲 https://huggingface.co/spaces/digitalxingtong/Taffy-Bert-VITS2 \n
                """)
        btn.click(tts_fn,
                inputs=[text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale],
                outputs=[text_output, audio_output,ogg_output])

    # Forward --share so the option actually controls the public link.
    app.launch(show_error=True, share=args.share)