import json

import gradio as gr
import torch

from model import GradTTSWithEmo
from model.classifier import SpecClassifier
from text import convert_text
import utils_data as utils

device = 'cuda' if torch.cuda.is_available() else 'cpu'
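# Decoding hyperparameters and CLI args for the two-emotion mixture setup,
# plus the Grad-TTS acoustic model with emotion conditioning.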
hps, args = utils.get_hparams_decode_two_mixture()
gradtts_uncond_model = GradTTSWithEmo(**hps.model).to(device)
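# Mel-spectrogram emotion classifier that steers decoding via classifier guidance.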
model = SpecClassifier(
    in_dim=hps.data.n_mel_channels,
    d_decoder=hps.model.d_decoder,
    h_decoder=hps.model.h_decoder,
    l_decoder=hps.model.l_decoder,
    k_decoder=hps.model.k_decoder,
    decoder_dropout=hps.model.decoder_dropout,
    n_class=hps.model.n_emos,
    cond_dim=hps.data.n_mel_channels,
    model_type=getattr(hps.model, "classifier_type", "CNN-with-time"),
)
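# Restore classifier and TTS weights from Space-local checkpoints.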
ckpt = './cnnwt_SGD_1959.pt'
ckpt_tts = './grad_uncond_cnn_001.pt'
utils.load_checkpoints_no_logger(ckpt_tts, gradtts_uncond_model, None)
utils.load_checkpoints_no_logger(ckpt, model, None)
_ = model.to(device).eval()
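# HiFi-GAN vocoder: mel-spectrogram -> waveform.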
HIFIGAN_CONFIG = './config.json'
HIFIGAN_CHECKPT = './g_01720000'

from models import Generator as HiFiGAN
from env import AttrDict

print('Initializing HiFi-GAN...')
with open(HIFIGAN_CONFIG) as f:
    h = AttrDict(json.load(f))
vocoder = HiFiGAN(h)
vocoder.load_state_dict(torch.load(HIFIGAN_CHECKPT, map_location=lambda storage, loc: storage)['generator'])
_ = vocoder.to(device).eval()
vocoder.remove_weight_norm()
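# Gradio callback: synthesize `text` as a mixture of two emotions.
# `quantity` (0-100) is the weight given to emotion 1 in the mixture.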
def generate_audio(text, quantity, speaker, emotion_1, emotion_2):
    x, x_lengths = convert_text(text)
    # Dropdowns return label strings; map them to class indices
    # (assumes training used the same sorted order as the `emotions` list below).
    emo1 = torch.LongTensor([emotions.index(emotion_1)]).to(device)
    emo2 = torch.LongTensor([emotions.index(emotion_2)]).to(device)
    sid = torch.LongTensor([speakers.index(speaker)]).to(device)
    intensity = quantity / 100
    # Reverse diffusion with classifier guidance toward the emotion mixture.
    y_enc, y_dec, attn = gradtts_uncond_model.classifier_guidance_decode_two_mixture(
        x, x_lengths,
        n_timesteps=10,
        temperature=2.0,
        stoc=args.stoc,
        spk=sid,
        emo1=emo1,
        emo2=emo2,
        emo1_weight=intensity,
        length_scale=1.,
        classifier_func=model.forward,
        guidance=300,
        classifier_type=model.model_type,
    )
    # Vocode the decoded mel-spectrogram; use `device` rather than a hard-coded
    # .cuda() so CPU-only Spaces still work.
    x = y_dec.detach().squeeze().unsqueeze(0).to(device)
    with torch.no_grad():
        y_g_hat = vocoder(x)
    # Scale the float waveform to 16-bit PCM for Gradio's audio component.
    audio = (y_g_hat.squeeze() * 32768.0).detach().cpu().numpy().astype('int16')
    sr = 22050
    return sr, audio
# UI label lists; indices must line up with the training label IDs.
emotions = sorted(["angry", "surprise", "fear", "happy", "neutral", "sad"])
speakers = ['Madi', 'Marzhan', 'Akzhol']
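# Quick smoke test without the UI (hypothetical inputs; uncomment to try):
# sr, audio = generate_audio('Сәлем', 50, speakers[1], 'happy', 'sad')
# from scipy.io.wavfile import write
# write('sample.wav', sr, audio)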
demo = gr.Interface(
    generate_audio,
    [
        gr.Textbox(value='Сәлем', label="Text you want to synthesize"),
        gr.Slider(0, 100, value=0, step=10, label="Emotion 1 weight",
                  info="Weight of the first emotion, between 0 and 100"),
        gr.Dropdown(speakers, value=speakers[1], label="Narrator", info="Select a narrator."),
        gr.Dropdown(emotions, value=emotions[0], label="Emotion 1", info="Select the first emotion."),
        gr.Dropdown(emotions, value=emotions[3], label="Emotion 2", info="Select the second emotion."),
    ],
    "audio",
)
demo.launch()