import json

import gradio as gr
import torch

from model import GradTTSWithEmo
from model.classifier import SpecClassifier
from text import convert_text
import utils_data as utils

# HiFi-GAN vocoder modules (from the HiFi-GAN repository)
from models import Generator as HiFiGAN
from env import AttrDict

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Hyperparameters and command-line arguments for two-emotion mixture decoding
hps, args = utils.get_hparams_decode_two_mixture()

# Emotion-conditioned Grad-TTS acoustic model
gradtts_uncond_model = GradTTSWithEmo(**hps.model).to(device)

# Mel-spectrogram emotion classifier used for classifier guidance
model = SpecClassifier(
    in_dim=hps.data.n_mel_channels,
    d_decoder=hps.model.d_decoder,
    h_decoder=hps.model.h_decoder,
    l_decoder=hps.model.l_decoder,
    k_decoder=hps.model.k_decoder,
    decoder_dropout=hps.model.decoder_dropout,
    n_class=hps.model.n_emos,
    cond_dim=hps.data.n_mel_channels,
    model_type=getattr(hps.model, "classifier_type", "CNN-with-time"),
)

ckpt = './cnnwt_SGD_1959.pt'           # classifier checkpoint
ckpt_tts = './grad_uncond_cnn_001.pt'  # Grad-TTS checkpoint

utils.load_checkpoints_no_logger(ckpt_tts, gradtts_uncond_model, None)
utils.load_checkpoints_no_logger(ckpt, model, None)
_ = model.to(device).eval()

HIFIGAN_CONFIG = './config.json'
HIFIGAN_CHECKPT = './g_01720000'

print('Initializing HiFi-GAN...')
with open(HIFIGAN_CONFIG) as f:
    h = AttrDict(json.load(f))
vocoder = HiFiGAN(h)
vocoder.load_state_dict(
    torch.load(HIFIGAN_CHECKPT, map_location=lambda storage, loc: storage)['generator']
)
_ = vocoder.to(device).eval()
vocoder.remove_weight_norm()

emotions = sorted(["angry", "surprise", "fear", "happy", "neutral", "sad"])
speakers = ['Madi', 'Marzhan', 'Akzhol']


def generate_audio(text, emo1_weight, speaker, emotion_1, emotion_2):
    x, x_lengths = convert_text(text)
    # Emotion IDs are assumed to follow the order of the sorted `emotions` list
    emo1 = torch.LongTensor([emotions.index(emotion_1)]).to(device)
    emo2 = torch.LongTensor([emotions.index(emotion_2)]).to(device)
    sid = torch.LongTensor([speakers.index(speaker)]).to(device)
    intensity = emo1_weight / 100  # slider value in percent -> mixture weight in [0, 1]

    # Reverse diffusion with classifier guidance, mixing the two emotions
    y_enc, y_dec, attn = gradtts_uncond_model.classifier_guidance_decode_two_mixture(
        x, x_lengths,
        n_timesteps=10,
        temperature=2.0,
        stoc=args.stoc,
        spk=sid,
        emo1=emo1,
        emo2=emo2,
        emo1_weight=intensity,
        length_scale=1.,
        classifier_func=model.forward,
        guidance=300,
        classifier_type=model.model_type,
    )
    y_dec = y_dec.detach()
    # y_dec = torch.nan_to_num(y_dec)
    res = y_dec.squeeze().cpu().numpy()

    # Vocode the generated mel-spectrogram to a 16-bit PCM waveform
    mel = torch.from_numpy(res).to(device).unsqueeze(0)
    y_g_hat = vocoder(mel)
    audio = y_g_hat.squeeze()
    audio = audio * 32768.0
    audio = audio.detach().cpu().numpy().astype('int16')
    sr = 22050
    return sr, audio


demo = gr.Interface(
    generate_audio,
    [
        gr.Textbox(label="Text", info="Text to synthesize."),
        gr.Slider(0, 100, value=0, step=10, label="Emotion 1 weight (%)",
                  info="Mixing weight of Emotion 1; Emotion 2 gets the remainder."),
        gr.Dropdown(speakers, value=speakers[1], label="Narrator", info="Select a narrator."),
        gr.Dropdown(emotions, value=emotions[0], label="Emotion 1", info="Select the first emotion."),
        gr.Dropdown(emotions, value=emotions[3], label="Emotion 2", info="Select the second emotion."),
    ],
    "audio",
)

demo.launch()
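
# --- Optional: offline synthesis without the Gradio UI ---
# A minimal sketch rather than part of the original demo: it reuses generate_audio()
# to render a single utterance and saves it with scipy. The helper name
# synthesize_to_file and its default arguments are illustrative assumptions;
# call it from a notebook cell or before demo.launch(), which blocks in a plain script.
from scipy.io.wavfile import write as write_wav


def synthesize_to_file(text, out_path="sample.wav", emo1_weight=50,
                       speaker="Marzhan", emotion_1="happy", emotion_2="sad"):
    # generate_audio returns (sample_rate, int16 waveform), the format Gradio's audio output expects
    sr, audio = generate_audio(text, emo1_weight, speaker, emotion_1, emotion_2)
    write_wav(out_path, sr, audio)
    return out_path

# Example (uncomment to run):
# synthesize_to_file("Your text here", out_path="mixed_emotions.wav")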