import json

import gradio as gr
import torch

import utils_data as utils
from model import GradTTSWithEmo
from model.classifier import SpecClassifier
from text import convert_text

device = 'cuda' if torch.cuda.is_available() else 'cpu'

hps, args = utils.get_hparams_decode_two_mixture()

# Grad-TTS acoustic model conditioned on emotion embeddings.
gradtts_uncond_model = GradTTSWithEmo(**hps.model).to(device)

# Mel-spectrogram emotion classifier used for classifier guidance.
model = SpecClassifier(
    in_dim=hps.data.n_mel_channels,
    d_decoder=hps.model.d_decoder,
    h_decoder=hps.model.h_decoder,
    l_decoder=hps.model.l_decoder,
    k_decoder=hps.model.k_decoder,
    decoder_dropout=hps.model.decoder_dropout,
    n_class=hps.model.n_emos,
    cond_dim=hps.data.n_mel_channels,
    model_type=getattr(hps.model, "classifier_type", "CNN-with-time"),
)

# ckpt = './cnnwt_SGD_1959.pt'
ckpt = './CNN_SGD_001_1885.pt'
ckpt_tts = './grad_uncond_cnn_001.pt'

utils.load_checkpoint_no_logger(ckpt_tts, gradtts_uncond_model, None)
utils.load_checkpoint_no_logger(ckpt, model, None)
_ = model.to(device).eval()

HIFIGAN_CONFIG = './config.json'
HIFIGAN_CHECKPT = './g_01720000'

from models import Generator as HiFiGAN
from env import AttrDict

print('Initializing HiFi-GAN...')
with open(HIFIGAN_CONFIG) as f:
    h = AttrDict(json.load(f))
vocoder = HiFiGAN(h)
vocoder.load_state_dict(
    torch.load(HIFIGAN_CHECKPT, map_location=lambda storage, loc: storage)['generator']
)
_ = vocoder.to(device).eval()
vocoder.remove_weight_norm()

emotions = sorted(["angry", "surprise", "fear", "happy", "neutral", "sad"])
speakers = ['Madi', 'Marzhan', 'Akzhol']


def generate_audio(text, quantity, speaker, emotion_1, emotion_2):
    """Synthesize speech mixing two emotions, with `quantity` (%) as the weight of emotion 1."""
    y_dec = torch.tensor([float('nan')])
    gui = 300  # classifier-guidance strength; lowered and retried if the decoder produces NaNs
    while torch.isnan(y_dec).sum() != 0:
        x, x_lengths = convert_text(text)
        emo1 = torch.LongTensor([emotions.index(emotion_1)]).to(device)
        emo2 = torch.LongTensor([emotions.index(emotion_2)]).to(device)
        sid = torch.LongTensor([speakers.index(speaker)]).to(device)
        intensity = quantity / 100  # weight of emotion 1; emotion 2 gets the remainder
        y_enc, y_dec, attn = gradtts_uncond_model.classifier_guidance_decode_two_mixture(
            x, x_lengths,
            n_timesteps=100,
            temperature=2.0,
            stoc=args.stoc,
            spk=sid,
            emo1=emo1,
            emo2=emo2,
            emo1_weight=intensity,
            length_scale=1.,
            classifier_func=model.forward,
            guidance=gui,
            classifier_type=model.model_type,
        )
        y_dec = y_dec.detach()
        mel = y_dec.squeeze().cpu().numpy()
        # Vocode the generated mel-spectrogram with HiFi-GAN and convert to 16-bit PCM.
        y_g_hat = vocoder(torch.from_numpy(mel).unsqueeze(0).to(device))
        audio = y_g_hat.squeeze() * 32768.0
        audio = audio.detach().cpu().numpy().astype('int16')
        gui -= 50
        if gui <= 0:
            break
    sr = 22050
    return (sr, audio)


demo = gr.Interface(
    generate_audio,
    [
        gr.Textbox(
            value='Батпақ соры шабындыққа және жыл бойғы жайылымға пайдаланылады.',
            label="Text you want to synthesize",
        ),
        gr.Slider(
            0, 100, value=50, step=10,
            label="Emotion 1 weight",
            info="Mixing weight of Emotion 1, in percent (Emotion 2 receives the rest)",
        ),
        # gr.Slider(0, 1000, value=100, step=10, label="Guidance", info="Choose between 0 and 1000"),
        gr.Dropdown(speakers, value=speakers[1], label="Narrator", info="Select a narrator."),
        gr.Dropdown(emotions, value=emotions[0], label="Emotion 1", info="Select the first emotion."),
        gr.Dropdown(emotions, value=emotions[3], label="Emotion 2", info="Select the second emotion."),
    ],
    "audio",
)

print('Launching the app...')
demo.launch()
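
# Optional smoke test (a minimal sketch, not part of the original app): call
# generate_audio() directly to verify the checkpoints and vocoder before serving
# the web UI. The output filename 'sample.wav' is an arbitrary choice, and scipy
# is assumed to be available; the text is the same Kazakh sample used as the
# Gradio default.
#
# from scipy.io.wavfile import write as write_wav
#
# sr, audio = generate_audio(
#     'Батпақ соры шабындыққа және жыл бойғы жайылымға пайдаланылады.',
#     quantity=50,
#     speaker=speakers[1],
#     emotion_1='happy',
#     emotion_2='neutral',
# )
# write_wav('sample.wav', sr, audio)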