Spaces:

AdalAbilbekov
/

EmotionalIntensityControl

Runtime error

App Files Files Community

EmotionalIntensityControl / app.py

AdalAbilbekov

255f95b 6 months ago

raw

history blame contribute delete

No virus

4.33 kB

	import gradio as gr
	import argparse
	import json
	import datetime as dt
	import numpy as np
	from scipy.io.wavfile import write
	import torch
	from pydub import AudioSegment
	from model.classifier import SpecClassifier
	from torch.utils.data import DataLoader
	from text import text_to_sequence, cmudict
	from text.symbols import symbols
	import utils_data as utils
	from kaldiio import WriteHelper
	import os
	from tqdm import tqdm
	from text import text_to_sequence, convert_text
	import sys
	from model import GradTTSXvector, GradTTSWithEmo
	import IPython.display as ipd

	# Run on the GPU when torch can see one, otherwise fall back to the CPU.
	device = 'cuda' if torch.cuda.is_available() else 'cpu'

	# Hyper-parameters and command-line arguments for two-emotion-mixture decoding.
	hps, args = utils.get_hparams_decode_two_mixture()

	# Acoustic model (Grad-TTS with emotion conditioning), built from the model hparams.
	gradtts_uncond_model = GradTTSWithEmo(**hps.model).to(device)

	# Emotion classifier over spectrograms; its forward pass is handed to the
	# acoustic model as the guidance function in generate_audio.
	model = SpecClassifier(
	    in_dim=hps.data.n_mel_channels,
	    d_decoder=hps.model.d_decoder,
	    h_decoder=hps.model.h_decoder,
	    l_decoder=hps.model.l_decoder,
	    k_decoder=hps.model.k_decoder,
	    decoder_dropout=hps.model.decoder_dropout,
	    n_class=hps.model.n_emos,
	    cond_dim=hps.data.n_mel_channels,
	    # Falls back to "CNN-with-time" when the hparams do not name a classifier type.
	    model_type=getattr(hps.model, "classifier_type", "CNN-with-time"),
	)

	# Checkpoints are expected next to this script.
	# Alternative classifier checkpoint used previously: './cnnwt_SGD_1959.pt'
	ckpt = './CNN_SGD_001_1885.pt'
	ckpt_tts = './grad_uncond_cnn_001.pt'

	utils.load_checkpoint_no_logger(ckpt_tts, gradtts_uncond_model, None)
	utils.load_checkpoint_no_logger(ckpt, model, None)

	# Both networks are used for inference only: switch them to eval mode so
	# train-time layers such as dropout are disabled. (Previously only the
	# classifier was put in eval mode; the acoustic model stayed in train mode.)
	gradtts_uncond_model.eval()
	model.to(device).eval()

	# HiFi-GAN vocoder: turns the mel-spectrogram produced by the acoustic
	# model into a waveform. Config and generator checkpoint sit next to this script.
	HIFIGAN_CONFIG = './config.json'
	HIFIGAN_CHECKPT = './g_01720000'

	from models import Generator as HiFiGAN
	from env import AttrDict

	print('Initializing HiFi-GAN...')
	with open(HIFIGAN_CONFIG) as config_file:
	    h = AttrDict(json.load(config_file))

	# Returning the storage unchanged from map_location keeps the tensors where
	# they were deserialized; the module is moved to `device` right after.
	hifigan_state = torch.load(
	    HIFIGAN_CHECKPT,
	    map_location=lambda storage, location: storage,
	)

	vocoder = HiFiGAN(h)
	vocoder.load_state_dict(hifigan_state['generator'])
	vocoder.to(device)
	vocoder.eval()
	vocoder.remove_weight_norm()

	emotions = sorted(["angry", "surprise", "fear", "happy", "neutral", "sad"])
	spekears = ['Madi', 'Marzhan', 'Akzhol']

	def generate_audio(text, quantity, speaker, emotion_1, emotion_2):
	    """Synthesize *text* as a weighted mixture of two emotions.

	    Args:
	        text: Sentence to synthesize; converted with ``convert_text``.
	        quantity: Slider value (0-100). Divided by 100 and passed to the
	            decoder as ``emo1_weight``.
	        speaker: Narrator name; must be an entry of ``spekears``.
	        emotion_1: First emotion label; must be an entry of ``emotions``.
	        emotion_2: Second emotion label; must be an entry of ``emotions``.

	    Returns:
	        A ``(sample_rate, samples)`` tuple where ``samples`` is an int16
	        NumPy array, handed to the Gradio ``"audio"`` output.

	    Raises:
	        ValueError: If ``speaker`` or an emotion label is not in its list.
	    """
	    # Everything here depends only on the arguments, so build it once
	    # instead of on every retry.
	    x, x_lengths = convert_text(text)
	    emo1 = torch.LongTensor([emotions.index(emotion_1)]).to(device)
	    emo2 = torch.LongTensor([emotions.index(emotion_2)]).to(device)
	    sid = torch.LongTensor([spekears.index(speaker)]).to(device)
	    intensity = quantity / 100

	    # Decode with progressively weaker classifier guidance (300, 250, ..., 50)
	    # until the decoder output contains no NaNs. The decode call is NOT
	    # wrapped in no_grad: classifier guidance may need autograd internally.
	    for gui in range(300, 0, -50):
	        _, y_dec, _ = gradtts_uncond_model.classifier_guidance_decode_two_mixture(
	            x, x_lengths,
	            n_timesteps=100,
	            temperature=2.0,
	            stoc=args.stoc,
	            spk=sid,
	            emo1=emo1,
	            emo2=emo2,
	            emo1_weight=intensity,
	            length_scale=1.,
	            classifier_func=model.forward,
	            guidance=gui,
	            classifier_type=model.model_type,
	        )
	        y_dec = y_dec.detach()
	        if not torch.isnan(y_dec).any():
	            break
	    # NOTE: if every guidance level produced NaNs, the last attempt is still
	    # vocoded below (best effort, same as before).

	    # Keep the mel on the vocoder's device. The old round trip through
	    # .numpy() failed on CUDA tensors and fed the vocoder a CPU tensor.
	    mel = y_dec.squeeze().unsqueeze(0).to(device)

	    # The vocoder runs once, on the final mel only, and without autograd.
	    with torch.no_grad():
	        waveform = vocoder(mel).squeeze()

	    # Scale to the int16 range and clamp so full-scale samples saturate
	    # instead of wrapping around in the integer cast.
	    waveform = torch.clamp(waveform * 32768.0, -32768.0, 32767.0)
	    audio = waveform.cpu().numpy().astype('int16')

	    sr = 22050
	    return (sr, audio)

	# Input widgets, listed in the positional order of generate_audio's parameters:
	# text, quantity, speaker, emotion_1, emotion_2.
	# (A "Guidance" slider, 0-1000, was considered here but is disabled.)
	input_widgets = [
	    gr.Textbox(
	        value='Батпақ соры шабындыққа және жыл бойғы жайылымға пайдаланылады.',
	        label="Text you want to synthesize",
	    ),
	    gr.Slider(
	        0,
	        100,
	        value=50,
	        step=10,
	        label="Count",
	        info="Choose between 0 and 100",
	    ),
	    gr.Dropdown(
	        spekears,
	        value=spekears[1],
	        label="Narrator",
	        info="Select a narrator.",
	    ),
	    gr.Dropdown(
	        emotions,
	        value=emotions[0],
	        label="Emotion 1",
	        info="Select first emotion",
	    ),
	    gr.Dropdown(
	        emotions,
	        value=emotions[3],
	        label="Emotion 2",
	        info="Select second emotion.",
	    ),
	]

	# Single audio output fed by the (sample_rate, samples) tuple from generate_audio.
	demo = gr.Interface(generate_audio, input_widgets, "audio")

	print('launching the app')
	demo.launch()