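"""Gradio demo: emotional TTS with mixed-emotion classifier guidance.

The script loads a GradTTSWithEmo acoustic model, a spectrogram emotion
classifier (used for classifier guidance during decoding), and a HiFi-GAN
vocoder, then serves a web UI where the user picks a narrator, two emotions,
and the weight of the first emotion in the mix.
"""
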
import json

import gradio as gr
import torch

import utils_data as utils
from model import GradTTSWithEmo
from model.classifier import SpecClassifier
from text import convert_text

# Run on the GPU when available, otherwise fall back to CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Hyperparameters and decoding options for the two-emotion mixture setup.
hps, args = utils.get_hparams_decode_two_mixture()

# Acoustic model: Grad-TTS with emotion conditioning.
gradtts_uncond_model = GradTTSWithEmo(**hps.model).to(device)

# Spectrogram emotion classifier used for classifier guidance during decoding.
model = SpecClassifier(
    in_dim=hps.data.n_mel_channels,
    d_decoder=hps.model.d_decoder,
    h_decoder=hps.model.h_decoder,
    l_decoder=hps.model.l_decoder,
    k_decoder=hps.model.k_decoder,
    decoder_dropout=hps.model.decoder_dropout,
    n_class=hps.model.n_emos,
    cond_dim=hps.data.n_mel_channels,
    model_type=getattr(hps.model, "classifier_type", "CNN-with-time"),
)

# Checkpoints for the emotion classifier and the TTS model.
# ckpt = './cnnwt_SGD_1959.pt'
# ckpt_tts = './grad_uncond_cnn_001.pt'
ckpt = './CNN_SGD_001_1885.pt'
ckpt_tts = './grad_uncond_cnn_001.pt'

utils.load_checkpoint_no_logger(ckpt_tts, gradtts_uncond_model, None)
utils.load_checkpoint_no_logger(ckpt, model, None)

# Both models are only used for inference.
_ = gradtts_uncond_model.eval()
_ = model.to(device).eval()

# HiFi-GAN vocoder: converts generated mel-spectrograms into waveforms.
HIFIGAN_CONFIG = './config.json'
HIFIGAN_CHECKPT = './g_01720000'

from models import Generator as HiFiGAN
from env import AttrDict

print('Initializing HiFi-GAN...')
with open(HIFIGAN_CONFIG) as f:
    h = AttrDict(json.load(f))
vocoder = HiFiGAN(h)
vocoder.load_state_dict(
    torch.load(HIFIGAN_CHECKPT, map_location=lambda storage, loc: storage)['generator']
)
_ = vocoder.to(device).eval()
vocoder.remove_weight_norm()

emotions = sorted(["angry", "surprise", "fear", "happy", "neutral", "sad"])
speakers = ['Madi', 'Marzhan', 'Akzhol']

def generate_audio(text, quantity, speaker, emotion_1, emotion_2):
    """Synthesize `text` while mixing two emotions.

    `quantity` is the weight of `emotion_1` in the mixture, in percent (0-100).
    Returns (sample_rate, int16 waveform) as expected by the Gradio audio output.
    """
    y_dec = torch.tensor([torch.nan])
    guidance = 300
    # If guided decoding produces NaNs, retry with progressively weaker guidance.
    while torch.isnan(y_dec).sum() != 0:
        x, x_lengths = convert_text(text)
        emo_1, emo_2 = emotions.index(emotion_1), emotions.index(emotion_2)
        emo1 = torch.LongTensor([emo_1]).to(device)
        emo2 = torch.LongTensor([emo_2]).to(device)
        sid = torch.LongTensor([speakers.index(speaker)]).to(device)
        intensity = quantity / 100

        y_enc, y_dec, attn = gradtts_uncond_model.classifier_guidance_decode_two_mixture(
            x, x_lengths,
            n_timesteps=100,
            temperature=2.0,
            stoc=args.stoc,
            spk=sid,
            emo1=emo1,
            emo2=emo2,
            emo1_weight=intensity,
            length_scale=1.,
            classifier_func=model.forward,
            guidance=guidance,
            classifier_type=model.model_type,
        )
        y_dec = y_dec.detach()

        # Vocode the generated mel-spectrogram and convert it to 16-bit PCM.
        mel = y_dec.squeeze().unsqueeze(0).to(device)
        y_g_hat = vocoder(mel)
        audio = y_g_hat.squeeze()
        audio = audio * 32768.0
        audio = audio.detach().cpu().numpy().astype('int16')

        guidance -= 50
        if guidance <= 0:
            break
    sr = 22050
    return (sr, audio)
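
# A usage sketch outside the web UI, with the checkpoints loaded above; saving the
# returned 16-bit PCM clip via scipy is one option:
#     from scipy.io.wavfile import write
#     sr, audio = generate_audio(
#         'Батпақ соры шабындыққа және жыл бойғы жайылымға пайдаланылады.',
#         70, speakers[0], 'happy', 'neutral')
#     write('sample.wav', sr, audio)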

demo = gr.Interface(
    generate_audio,
    [
        # The default prompt is a Kazakh sample sentence.
        gr.Textbox(value='Батпақ соры шабындыққа және жыл бойғы жайылымға пайдаланылады.',
                   label="Text you want to synthesize"),
        gr.Slider(0, 100, value=50, step=10, label="Emotion 1 weight",
                  info="Weight of Emotion 1 in the mixture, between 0 and 100"),
        # gr.Slider(0, 1000, value=100, step=10, label="Guidance", info="Choose between 0 and 1000"),
        gr.Dropdown(speakers, value=speakers[1], label="Narrator", info="Select a narrator."),
        gr.Dropdown(emotions, value=emotions[0], label="Emotion 1", info="Select the first emotion."),
        gr.Dropdown(emotions, value=emotions[3], label="Emotion 2", info="Select the second emotion."),
    ],
    "audio",
)
print('Launching the app...')
demo.launch()
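
# For a publicly shareable link (e.g. when running on a remote machine), one could
# instead call demo.launch(share=True).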