import json

import gradio as gr
import torch

from model import GradTTSWithEmo
from model.classifier import SpecClassifier
from text import convert_text
import utils_data as utils

# HiFi-GAN vocoder modules (from the HiFi-GAN repository)
from models import Generator as HiFiGAN
from env import AttrDict

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Hyperparameters and command-line arguments for two-emotion mixture decoding
hps, args = utils.get_hparams_decode_two_mixture()

# Emotion-conditioned Grad-TTS acoustic model
gradtts_uncond_model = GradTTSWithEmo(**hps.model).to(device)

# Mel-spectrogram emotion classifier used for classifier guidance
model = SpecClassifier(
    in_dim=hps.data.n_mel_channels,
    d_decoder=hps.model.d_decoder,
    h_decoder=hps.model.h_decoder,
    l_decoder=hps.model.l_decoder,
    k_decoder=hps.model.k_decoder,
    decoder_dropout=hps.model.decoder_dropout,
    n_class=hps.model.n_emos,
    cond_dim=hps.data.n_mel_channels,
    model_type=getattr(hps.model, "classifier_type", "CNN-with-time"),
)

ckpt = './cnnwt_SGD_1959.pt'           # classifier checkpoint
ckpt_tts = './grad_uncond_cnn_001.pt'  # Grad-TTS checkpoint

utils.load_checkpoints_no_logger(ckpt_tts, gradtts_uncond_model, None)
utils.load_checkpoints_no_logger(ckpt, model, None)
_ = model.to(device).eval()

HIFIGAN_CONFIG = './config.json'
HIFIGAN_CHECKPT = './g_01720000'

print('Initializing HiFi-GAN...')
with open(HIFIGAN_CONFIG) as f:
    h = AttrDict(json.load(f))
vocoder = HiFiGAN(h)
vocoder.load_state_dict(
    torch.load(HIFIGAN_CHECKPT, map_location=lambda storage, loc: storage)['generator']
)
_ = vocoder.to(device).eval()
vocoder.remove_weight_norm()

emotions = sorted(["angry", "surprise", "fear", "happy", "neutral", "sad"])
speakers = ['Madi', 'Marzhan', 'Akzhol']


def generate_audio(text, emo1_weight, speaker, emotion_1, emotion_2):
    x, x_lengths = convert_text(text)
    # Emotion IDs are assumed to follow the order of the sorted `emotions` list
    emo1 = torch.LongTensor([emotions.index(emotion_1)]).to(device)
    emo2 = torch.LongTensor([emotions.index(emotion_2)]).to(device)
    sid = torch.LongTensor([speakers.index(speaker)]).to(device)
    intensity = emo1_weight / 100  # slider value in percent -> mixture weight in [0, 1]

    # Reverse diffusion with classifier guidance, mixing the two emotions
    y_enc, y_dec, attn = gradtts_uncond_model.classifier_guidance_decode_two_mixture(
        x, x_lengths,
        n_timesteps=10,
        temperature=2.0,
        stoc=args.stoc,
        spk=sid,
        emo1=emo1,
        emo2=emo2,
        emo1_weight=intensity,
        length_scale=1.,
        classifier_func=model.forward,
        guidance=300,
        classifier_type=model.model_type,
    )
    y_dec = y_dec.detach()
    # y_dec = torch.nan_to_num(y_dec)
    res = y_dec.squeeze().cpu().numpy()

    # Vocode the generated mel-spectrogram to a 16-bit PCM waveform
    mel = torch.from_numpy(res).to(device).unsqueeze(0)
    y_g_hat = vocoder(mel)
    audio = y_g_hat.squeeze()
    audio = audio * 32768.0
    audio = audio.detach().cpu().numpy().astype('int16')
    sr = 22050
    return sr, audio


demo = gr.Interface(
    generate_audio,
    [
        gr.Textbox(label="Text", info="Text to synthesize."),
        gr.Slider(0, 100, value=0, step=10, label="Emotion 1 weight (%)",
                  info="Mixing weight of Emotion 1; Emotion 2 gets the remainder."),
        gr.Dropdown(speakers, value=speakers[1], label="Narrator", info="Select a narrator."),
        gr.Dropdown(emotions, value=emotions[0], label="Emotion 1", info="Select the first emotion."),
        gr.Dropdown(emotions, value=emotions[3], label="Emotion 2", info="Select the second emotion."),
    ],
    "audio",
)

demo.launch()
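
# --- Optional: offline synthesis without the Gradio UI ---
# A minimal sketch rather than part of the original demo: it reuses generate_audio()
# to render a single utterance and saves it with scipy. The helper name
# synthesize_to_file and its default arguments are illustrative assumptions;
# call it from a notebook cell or before demo.launch(), which blocks in a plain script.
from scipy.io.wavfile import write as write_wav


def synthesize_to_file(text, out_path="sample.wav", emo1_weight=50,
                       speaker="Marzhan", emotion_1="happy", emotion_2="sad"):
    # generate_audio returns (sample_rate, int16 waveform), the format Gradio's audio output expects
    sr, audio = generate_audio(text, emo1_weight, speaker, emotion_1, emotion_2)
    write_wav(out_path, sr, audio)
    return out_path

# Example (uncomment to run):
# synthesize_to_file("Your text here", out_path="mixed_emotions.wav")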