import json

import gradio as gr
import torch

from model import GradTTSWithEmo
from model.classifier import SpecClassifier
from text import convert_text
import utils_data as utils

device = 'cuda' if torch.cuda.is_available() else 'cpu'
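# Decoding hyperparameters and CLI args for the two-emotion mixture setup,
# plus the Grad-TTS acoustic model with emotion conditioning.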
hps, args = utils.get_hparams_decode_two_mixture()
gradtts_uncond_model = GradTTSWithEmo(**hps.model).to(device)
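# Mel-spectrogram emotion classifier that steers decoding via classifier guidance.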
model = SpecClassifier(
    in_dim=hps.data.n_mel_channels,
    d_decoder=hps.model.d_decoder,
    h_decoder=hps.model.h_decoder,
    l_decoder=hps.model.l_decoder,
    k_decoder=hps.model.k_decoder,
    decoder_dropout=hps.model.decoder_dropout,
    n_class=hps.model.n_emos,
    cond_dim=hps.data.n_mel_channels,
    model_type=getattr(hps.model, "classifier_type", "CNN-with-time"),
)
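# Restore classifier and TTS weights from Space-local checkpoints.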
ckpt = './cnnwt_SGD_1959.pt'
ckpt_tts = './grad_uncond_cnn_001.pt'
utils.load_checkpoints_no_logger(ckpt_tts, gradtts_uncond_model, None)
utils.load_checkpoints_no_logger(ckpt, model, None)
_ = model.to(device).eval()
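# HiFi-GAN vocoder: mel-spectrogram -> waveform.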
HIFIGAN_CONFIG = './config.json'
HIFIGAN_CHECKPT = './g_01720000'

from models import Generator as HiFiGAN
from env import AttrDict

print('Initializing HiFi-GAN...')
with open(HIFIGAN_CONFIG) as f:
    h = AttrDict(json.load(f))
vocoder = HiFiGAN(h)
vocoder.load_state_dict(torch.load(HIFIGAN_CHECKPT, map_location=lambda storage, loc: storage)['generator'])
_ = vocoder.to(device).eval()
vocoder.remove_weight_norm()
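# Gradio callback: synthesize `text` as a mixture of two emotions.
# `quantity` (0-100) is the weight given to emotion 1 in the mixture.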
def generate_audio(text, quantity, speaker, emotion_1, emotion_2):
    x, x_lengths = convert_text(text)
    # Dropdowns return label strings; map them to class indices
    # (assumes training used the same sorted order as the `emotions` list below).
    emo1 = torch.LongTensor([emotions.index(emotion_1)]).to(device)
    emo2 = torch.LongTensor([emotions.index(emotion_2)]).to(device)
    sid = torch.LongTensor([speakers.index(speaker)]).to(device)
    intensity = quantity / 100
    # Reverse diffusion with classifier guidance toward the emotion mixture.
    y_enc, y_dec, attn = gradtts_uncond_model.classifier_guidance_decode_two_mixture(
        x, x_lengths,
        n_timesteps=10,
        temperature=2.0,
        stoc=args.stoc,
        spk=sid,
        emo1=emo1,
        emo2=emo2,
        emo1_weight=intensity,
        length_scale=1.,
        classifier_func=model.forward,
        guidance=300,
        classifier_type=model.model_type,
    )
    # Vocode the decoded mel-spectrogram; use `device` rather than a hard-coded
    # .cuda() so CPU-only Spaces still work.
    x = y_dec.detach().squeeze().unsqueeze(0).to(device)
    with torch.no_grad():
        y_g_hat = vocoder(x)
    # Scale the float waveform to 16-bit PCM for Gradio's audio component.
    audio = (y_g_hat.squeeze() * 32768.0).detach().cpu().numpy().astype('int16')
    sr = 22050
    return sr, audio
# UI label lists; indices must line up with the training label IDs.
emotions = sorted(["angry", "surprise", "fear", "happy", "neutral", "sad"])
speakers = ['Madi', 'Marzhan', 'Akzhol']
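# Quick smoke test without the UI (hypothetical inputs; uncomment to try):
# sr, audio = generate_audio('Сәлем', 50, speakers[1], 'happy', 'sad')
# from scipy.io.wavfile import write
# write('sample.wav', sr, audio)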
demo = gr.Interface(
    generate_audio,
    [
        gr.Textbox(value='Сәлем', label="Text you want to synthesize"),
        gr.Slider(0, 100, value=0, step=10, label="Emotion 1 weight",
                  info="Weight of the first emotion, between 0 and 100"),
        gr.Dropdown(speakers, value=speakers[1], label="Narrator", info="Select a narrator."),
        gr.Dropdown(emotions, value=emotions[0], label="Emotion 1", info="Select the first emotion."),
        gr.Dropdown(emotions, value=emotions[3], label="Emotion 2", info="Select the second emotion."),
    ],
    "audio",
)
demo.launch()