leo-emovits

Build error

File size: 3,473 Bytes

04749f1
72aa6e6
04749f1
 
44847f5
74ec31e
72aa6e6
5a39f1e
d8efc4e
2e910d9
5a39f1e
 
 
04749f1
5a39f1e
2e910d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72aa6e6
2e910d9
 
 
 
 
04749f1
5a39f1e
72aa6e6
5a39f1e
2e910d9
 
 
 
80be904
04749f1
3980d4c
a15140f
 
 
5a39f1e
 
 
 
2e910d9
 
 
f6fdc84
 
 
 
 
 
5a39f1e
 
 
 
f6fdc84
2e910d9
80be904
 
0e939b1
 
 
80be904
 
0e939b1
80be904
 
5a39f1e
c4cb69e
04749f1
81470ae

import gradio as gr
import os
import random
import IPython.display as ipd
import commons
import utils
import json
import torch
import tempfile
import numpy as np
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
from scipy.io.wavfile import write

def get_text(text, hps):
    """Convert raw text into a ``torch.LongTensor`` of symbol ids.

    Args:
        text: Input text handed to ``text_to_sequence``.
        hps: Hyper-parameter object; ``hps.data.text_cleaners`` and
            ``hps.data.add_blank`` are read.

    Returns:
        A ``torch.LongTensor`` built from the id sequence. When
        ``hps.data.add_blank`` is truthy the sequence is first passed through
        ``commons.intersperse(seq, 0)`` (presumably interleaving the blank
        id 0 between tokens -- confirm against ``commons``).
    """
    data_cfg = hps.data
    token_ids = text_to_sequence(text, data_cfg.text_cleaners)
    return torch.LongTensor(
        commons.intersperse(token_ids, 0) if data_cfg.add_blank else token_ids
    )

def get_text_byroma(text, hps):
    """Convert an already-symbolised string into a ``torch.LongTensor``.

    Every element of *text* is looked up directly in ``symbols`` (no text
    cleaning is applied), so each one must be present there; ``symbols.index``
    raises otherwise.

    Args:
        text: Iterable of symbols (e.g. a string, iterated per character).
        hps: Hyper-parameter object; only ``hps.data.add_blank`` is read.

    Returns:
        A ``torch.LongTensor`` of symbol indices, passed through
        ``commons.intersperse(seq, 0)`` first when ``hps.data.add_blank`` is
        truthy.
    """
    symbol_ids = [symbols.index(ch) for ch in text]
    if hps.data.add_blank:
        symbol_ids = commons.intersperse(symbol_ids, 0)
    return torch.LongTensor(symbol_ids)

# Load the hyper-parameters from the JSON config. `hps` is a module-level
# object read by `tts` further down in this file.
hps = utils.get_hparams_from_file("./configs/leo.json")
# Build the synthesizer network from the symbol table size and config values.
net_g = SynthesizerTrn(
    len(symbols),  # number of entries in the symbol table
    hps.data.filter_length // 2 + 1,  # presumably the spectrogram bin count -- confirm against SynthesizerTrn
    hps.train.segment_size // hps.data.hop_length,  # presumably the training segment length in frames -- confirm
    n_speakers=hps.data.n_speakers,
    **hps.model)
# Call eval() on the network (inference mode, assuming SynthesizerTrn is a
# torch.nn.Module); the return value is intentionally discarded.
_ = net_g.eval()
# Load the checkpoint into net_g. The third argument is None -- presumably
# the optimizer slot, which is not needed here (TODO confirm in utils).
_ = utils.load_checkpoint("logs/leo/G_4000.pth", net_g, None)

# Root directory from which reference emotion audio is randomly sampled.
random_emotion_root = "wavs"
# Parsed contents of the JSON config. The file is opened in a `with` block so
# the handle is closed deterministically instead of being left to the garbage
# collector; JSON text is UTF-8, so the encoding is stated explicitly rather
# than depending on the platform default.
with open("configs/leo.json", "r", encoding="utf-8") as _config_file:
    emotion_dict = json.load(_config_file)

def _load_emotion_embedding(emotion):
    """Return the emotion embedding tensor selected by *emotion*.

    Resolution order:

    1. If ``f"{emotion}.emo.npy"`` exists, load it and add a leading axis
       with ``unsqueeze(0)``.
    2. If *emotion* is ``"random_sample"``, pick a random file under
       ``random_emotion_root`` whose name ends in ``wav`` and that has a
       matching ``.emo.npy`` file, load that embedding and print the chosen
       wav path.
    3. If *emotion* is a string ending in ``wav``, compute the embedding with
       ``emotion_extract.extract_wav``.

    Raises:
        FileNotFoundError: ``"random_sample"`` was requested but no wav with
            a matching ``.emo.npy`` exists under ``random_emotion_root``.
        ValueError: *emotion* matches none of the supported forms.
    """
    precomputed_path = f"{emotion}.emo.npy"
    if os.path.exists(precomputed_path):
        return torch.FloatTensor(np.load(precomputed_path)).unsqueeze(0)

    if emotion == "random_sample":
        # Collect the eligible files up front: retrying random picks in a loop
        # never terminates when no wav has a pre-computed embedding.
        candidates = [
            name
            for name in os.listdir(random_emotion_root)
            if name.endswith("wav")
            and os.path.exists(f"{random_emotion_root}/{name}.emo.npy")
        ]
        if not candidates:
            raise FileNotFoundError(
                f"no wav file with a matching .emo.npy found in {random_emotion_root!r}"
            )
        rand_wav = random.choice(candidates)
        emo = torch.FloatTensor(
            np.load(f"{random_emotion_root}/{rand_wav}.emo.npy")
        ).unsqueeze(0)
        print(f"{random_emotion_root}/{rand_wav}")
        return emo

    # The isinstance guard keeps a missing selection (None) on the ValueError
    # path instead of failing with AttributeError.
    if isinstance(emotion, str) and emotion.endswith("wav"):
        # Imported lazily so the extractor is only loaded when a raw wav is used.
        import emotion_extract
        return torch.FloatTensor(emotion_extract.extract_wav(emotion))

    # Fail loudly: continuing without an embedding would only crash later
    # with an unrelated unbound-variable error.
    raise ValueError("emotion参数不正确")


def tts(txt, emotion, temp_file_path, roma=False):
    """Synthesize speech for *txt* using the given reference emotion.

    Args:
        txt: Text to synthesize.
        emotion: Path of a reference emotion audio file (a wav, optionally
            with a pre-computed ``<path>.emo.npy`` next to it), or
            ``"random_sample"`` to draw a random reference from
            ``random_emotion_root``.
        temp_file_path: Path the waveform is written to for display; the
            file is removed again before this function returns or raises.
        roma: When true, *txt* is converted with ``get_text_byroma`` (direct
            symbol lookup); otherwise with ``get_text``. Defaults to False.

    Returns:
        The synthesized waveform as a NumPy array, taken from
        ``net_g.infer(...)[0][0, 0]``.

    Raises:
        FileNotFoundError, ValueError: see ``_load_emotion_embedding``.
    """
    try:
        stn_tst = get_text_byroma(txt, hps) if roma else get_text(txt, hps)

        with torch.no_grad():
            x_tst = stn_tst.unsqueeze(0)
            x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
            sid = torch.LongTensor([0])
            emo = _load_emotion_embedding(emotion)
            audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.8, length_scale=1.2, emo=emo)[0][0,0].data.float().numpy()

        # Save the numpy array as a temporary file
        write(temp_file_path, hps.data.sampling_rate, audio)

        # Display the audio
        ipd.display(ipd.Audio(temp_file_path, rate=hps.data.sampling_rate, normalize=False))
    finally:
        # Delete the temporary file even when synthesis or display failed.
        if os.path.exists(temp_file_path):
            os.remove(temp_file_path)

    return audio

def generate_audio(txt, emotion):
    """Gradio callback: synthesize *txt* with the selected reference emotion.

    Args:
        txt: Text entered by the user.
        emotion: Selected reference emotion (see ``tts``).

    Returns:
        A ``(sample_rate, waveform)`` tuple, which is the form a Gradio Audio
        output with ``type="numpy"`` expects.
    """
    # Reserve a unique temporary path, then close the handle right away: an
    # open handle leaks a file descriptor and prevents the file from being
    # reopened for writing on Windows.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        temp_file_path = temp_file.name
    try:
        audio = tts(txt, emotion, temp_file_path)
    finally:
        # tts removes the file on success; make sure it is gone on failure too.
        if os.path.exists(temp_file_path):
            os.remove(temp_file_path)
    return hps.data.sampling_rate, audio

# Gradio UI wiring.
# NOTE(review): gr.inputs / gr.outputs are the legacy Gradio namespaces --
# confirm the pinned Gradio version still provides them.
input_text = gr.inputs.Textbox(label="输入文本")
# Offer only the wav files, as paths under random_emotion_root: `tts` resolves
# the selected value (and its "<path>.emo.npy" companion) relative to the
# working directory, so bare os.listdir() names -- which would also include
# the .emo.npy files themselves -- cannot be resolved. Sorted for a stable order.
input_emotion = gr.inputs.Dropdown(
    choices=["random_sample"] + sorted(
        f"{random_emotion_root}/{name}"
        for name in os.listdir(random_emotion_root)
        if name.endswith("wav")
    ),
    label="参考情感音频",
)
output_audio = gr.outputs.Audio(type="numpy", label="合成音频")

iface = gr.Interface(fn=generate_audio, inputs=[input_text, input_emotion], outputs=output_audio)
iface.launch()