AdalAbilbekov committed on
Commit
ae8e1dd
1 Parent(s): 9d9a36a

First commit

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. .gitattributes +2 -0
  2. app.py +117 -0
  3. cnnwt_SGD_1959.pt +3 -0
  4. config.json +37 -0
  5. configs/.DS_Store +0 -0
  6. configs/hifigan-config.json +37 -0
  7. configs/train_grad.json +68 -0
  8. data_collate.py +147 -0
  9. data_loader.py +309 -0
  10. data_preparation.py +108 -0
  11. env.py +15 -0
  12. filelists/all_spks/eval_utts.txt +3730 -0
  13. filelists/all_spks/feats.ark +0 -0
  14. filelists/all_spks/feats.scp +3 -0
  15. filelists/all_spks/text +0 -0
  16. filelists/all_spks/train_utts.txt +0 -0
  17. filelists/all_spks/utt2emo.json +0 -0
  18. filelists/all_spks/utt2spk.json +0 -0
  19. filelists/inference_generated.txt +2 -0
  20. g_01720000 +3 -0
  21. grad_uncond.pt +3 -0
  22. grad_uncond_10k_conf.pt +3 -0
  23. grad_uncond_cnn_001.pt +3 -0
  24. inference_EMA.py +89 -0
  25. inference_intensity_control.ipynb +0 -0
  26. melspec.py +40 -0
  27. model/__init__.py +2 -0
  28. model/__pycache__/__init__.cpython-39.pyc +0 -0
  29. model/__pycache__/tts.cpython-39.pyc +0 -0
  30. model/base.py +28 -0
  31. model/classifier.py +690 -0
  32. model/diffusion.py +513 -0
  33. model/monotonic_align/LICENCE +21 -0
  34. model/monotonic_align/__init__.py +23 -0
  35. model/monotonic_align/__pycache__/__init__.cpython-39.pyc +0 -0
  36. model/monotonic_align/build/lib.macosx-11.1-arm64-cpython-310/model/monotonic_align/core.cpython-310-darwin.so +0 -0
  37. model/monotonic_align/build/temp.linux-x86_64-3.6/core.o +3 -0
  38. model/monotonic_align/build/temp.macosx-10.9-x86_64-3.6/core.o +0 -0
  39. model/monotonic_align/build/temp.macosx-11.1-arm64-cpython-310/core.o +0 -0
  40. model/monotonic_align/core.c +0 -0
  41. model/monotonic_align/core.pyx +45 -0
  42. model/monotonic_align/model/monotonic_align/core.cpython-310-darwin.so +0 -0
  43. model/monotonic_align/setup.py +11 -0
  44. model/text_encoder.py +326 -0
  45. model/tts.py +558 -0
  46. model/utils.py +44 -0
  47. models.py +283 -0
  48. text/.DS_Store +0 -0
  49. text/LICENSE +30 -0
  50. text/__init__.py +106 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ g_01720000 filter=lfs diff=lfs merge=lfs -text
37
+ model/monotonic_align/build/temp.linux-x86_64-3.6/core.o filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,117 @@
1
+ import gradio as gr
2
+ import argparse
3
+ import json
4
+ import datetime as dt
5
+ import numpy as np
6
+ from scipy.io.wavfile import write
7
+ import gradio as gr
8
+ import torch
9
+ from pydub import AudioSegment
10
+ from model.classifier import SpecClassifier
11
+ from torch.utils.data import DataLoader
12
+ from text import text_to_sequence, cmudict
13
+ from text.symbols import symbols
14
+ import utils_data as utils
15
+ from utils import load_checkpoint_no_logger
16
+ from kaldiio import WriteHelper
17
+ import os
18
+ from tqdm import tqdm
19
+ from text import text_to_sequence, convert_text
20
+ import sys
21
+ from model import GradTTSXvector, GradTTSWithEmo
22
+ import IPython.display as ipd
23
+
24
+ device = ('cuda' if torch.cuda.is_available() else 'cpu')
25
+ print(device)
26
+
27
+ hps, args = utils.get_hparams_decode_two_mixture()
28
+
29
+ gradtts_uncond_model = GradTTSWithEmo
30
+
31
+ gradtts_uncond_model = gradtts_uncond_model(**hps.model).to(device)
32
+ model = SpecClassifier(
33
+ in_dim=hps.data.n_mel_channels,
34
+ d_decoder=hps.model.d_decoder,
35
+ h_decoder=hps.model.h_decoder,
36
+ l_decoder=hps.model.l_decoder,
37
+ k_decoder=hps.model.k_decoder,
38
+ decoder_dropout=hps.model.decoder_dropout,
39
+ n_class=hps.model.n_emos,
40
+ cond_dim=hps.data.n_mel_channels,
41
+ model_type=getattr(hps.model, "classifier_type", "CNN-with-time")
42
+ )
43
+
44
+ ckpt = './cnnwt_SGD_1959.pt'
45
+ ckpt_tts = './grad_uncond_cnn_001.pt'
46
+
47
+ utils.load_checkpoints_no_logger(ckpt_tts, gradtts_uncond_model, None)
48
+ utils.load_checkpoints_no_logger(ckpt, model, None)
49
+
50
+ _ = model.to(device).eval()
51
+
52
+ HIFIGAN_CONFIG = './config.json'
53
+ HIFIGAN_CHECKPT = './g_01720000'
54
+
55
+ from models import Generator as HiFiGAN
56
+ from env import AttrDict
57
+ print('Initializing HiFi-GAN...')
58
+ with open(HIFIGAN_CONFIG) as f:
59
+ h = AttrDict(json.load(f))
60
+ vocoder = HiFiGAN(h)
61
+ vocoder.load_state_dict(torch.load(HIFIGAN_CHECKPT, map_location=lambda loc, storage: loc)['generator'])
62
+ _ = vocoder.to(device).eval()
63
+ vocoder.remove_weight_norm()
64
+
65
+ def generate_audio(text, quantity, speaker, emotion_1, emotion_2):
66
+ x, x_lengths = convert_text(text)
67
+ emo_1, emo_2 = emotion_1, emotion_2
68
+ emo1 = torch.LongTensor([emo_1]).to(device)
69
+ emo2 = torch.LongTensor([emo_2]).to(device)
70
+ sid = torch.LongTensor([spekears.index(speaker)]).to(device)
71
+ intensity = quantity / 100
72
+
73
+ y_enc, y_dec, attn = gradtts_uncond_model.classifier_guidance_decode_two_mixture(
74
+ x, x_lengths,
75
+ n_timesteps=10,
76
+ temperature=2.0,
77
+ stoc=args.stoc,
78
+ spk=sid,
79
+ emo1=emo1,
80
+ emo2=emo2,
81
+ emo1_weight=intensity,
82
+ length_scale=1.,
83
+ classifier_func=model.forward,
84
+ guidance=300,
85
+ classifier_type=model.model_type
86
+ )
87
+ y_dec = y_dec.detach()
88
+ # y_dec = torch.nan_to_num(y_dec)
89
+ res = y_dec.squeeze().cpu().numpy()
90
+ x = torch.from_numpy(res).to(device).unsqueeze(0)
91
+ y_g_hat = vocoder(x)
92
+ audio = y_g_hat.squeeze()
93
+ audio = audio * 32768.0
94
+ audio = audio.detach().cpu().numpy().astype('int16')
95
+ sr = 22050
96
+ return sr, audio
97
+
98
+ # def sentence_builder(quantity, emotion_1, emotion_2):
99
+ # return f"""The {quantity} {emotion_1}s from {" and ".join(emotion_2)}"""
100
+
101
+ emotions = sorted(["angry", "surprise", "fear", "happy", "neutral", "sad"])
102
+ spekears = ['Madi', 'Marzhan', 'Akzhol']
103
+
104
+ demo = gr.Interface(
105
+ generate_audio,
106
+ [
107
+ gr.Textbox(label="Text", info="Text to synthesize."),
+ gr.Slider(0, 100, value=0, step=10, label="Intensity", info="Weight of Emotion 1, between 0 and 100."),
108
+ gr.Dropdown(spekears, value=spekears[1], label="Narrator", info="Select a narrator."
109
+ ),
110
+ gr.Dropdown(emotions, type="index", label="Emotion 1", info="Select first emotion."),
111
+ gr.Dropdown(emotions, type="index", value=emotions[3], label="Emotion 2", info="Select second emotion."
112
+ ),
113
+ ],
114
+ "audio"
115
+ )
116
+
117
+ demo.launch()
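The value returned by generate_audio is a plain (sample_rate, int16 numpy array) pair, so the same pipeline can be driven without the Gradio UI. Below is a minimal sketch of such a call; it assumes the trailing demo.launch() is guarded by an if __name__ == "__main__" check so that importing app does not start the web server, and the input text, output file name, and emotion choices are illustrative only, not taken from the repo.

# illustrative driver; assumes app.py can be imported without launching the demo
from scipy.io.wavfile import write

import app  # loads the Grad-TTS model, classifier and HiFi-GAN exactly as above

sr, audio = app.generate_audio(
    text="Сәлем!",                            # short Kazakh input, for illustration
    quantity=70,                              # slider value; app.py divides by 100 -> emo1_weight = 0.7
    speaker=app.spekears[1],                  # "Marzhan"
    emotion_1=app.emotions.index("angry"),    # integer indices into the sorted emotion list
    emotion_2=app.emotions.index("happy"),
)
write("mixture_sample.wav", sr, audio)        # int16 audio at 22050 Hz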
cnnwt_SGD_1959.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dad97a741b8faad42f6d4c0ccd808f20cd4d1e01890db0c3935d131dfafc9977
3
+ size 1948051
config.json ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ "resblock": "1",
3
+ "num_gpus": 1,
4
+ "batch_size": 64,
5
+ "learning_rate": 0.0002,
6
+ "adam_b1": 0.8,
7
+ "adam_b2": 0.99,
8
+ "lr_decay": 0.999,
9
+ "seed": 1234,
10
+
11
+ "upsample_rates": [8,8,2,2],
12
+ "upsample_kernel_sizes": [16,16,4,4],
13
+ "upsample_initial_channel": 512,
14
+ "resblock_kernel_sizes": [3,7,11],
15
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
16
+
17
+ "segment_size": 8192,
18
+ "num_mels": 80,
19
+ "num_freq": 1025,
20
+ "n_fft": 1024,
21
+ "hop_size": 256,
22
+ "win_size": 1024,
23
+
24
+ "sampling_rate": 22050,
25
+
26
+ "fmin": 0,
27
+ "fmax": 8000,
28
+ "fmax_for_loss": null,
29
+
30
+ "num_workers": 4,
31
+
32
+ "dist_config": {
33
+ "dist_backend": "nccl",
34
+ "dist_url": "tcp://localhost:54320",
35
+ "world_size": 1
36
+ }
37
+ }
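The mel-related fields above (num_mels, n_fft, hop_size, win_size, fmin, fmax, sampling_rate) have to agree with the settings used when features are extracted in data_preparation.py, otherwise the vocoder is fed mels it was never trained on. A minimal sanity-check sketch, assuming config.json is in the working directory and that melspec.mel_spectrogram follows the usual HiFi-GAN argument order (n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax); the check itself is not part of the repo.

import json

with open("config.json") as f:
    h = json.load(f)

# values hard-coded in data_preparation.py's mel_spectrogram(...) call
expected = {"n_fft": 1024, "num_mels": 80, "sampling_rate": 22050,
            "hop_size": 256, "win_size": 1024, "fmin": 0, "fmax": 8000}

for key, value in expected.items():
    assert h[key] == value, f"{key}: vocoder config has {h[key]}, feature extraction used {value}"
print("Vocoder config matches the feature-extraction settings.")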
configs/.DS_Store ADDED
Binary file (6.15 kB)
configs/hifigan-config.json ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ "resblock": "1",
3
+ "num_gpus": 1,
4
+ "batch_size": 64,
5
+ "learning_rate": 0.0002,
6
+ "adam_b1": 0.8,
7
+ "adam_b2": 0.99,
8
+ "lr_decay": 0.999,
9
+ "seed": 1234,
10
+
11
+ "upsample_rates": [8,8,2,2],
12
+ "upsample_kernel_sizes": [16,16,4,4],
13
+ "upsample_initial_channel": 512,
14
+ "resblock_kernel_sizes": [3,7,11],
15
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
16
+
17
+ "segment_size": 8192,
18
+ "num_mels": 80,
19
+ "num_freq": 1025,
20
+ "n_fft": 1024,
21
+ "hop_size": 256,
22
+ "win_size": 1024,
23
+
24
+ "sampling_rate": 22050,
25
+
26
+ "fmin": 0,
27
+ "fmax": 8000,
28
+ "fmax_for_loss": null,
29
+
30
+ "num_workers": 4,
31
+
32
+ "dist_config": {
33
+ "dist_backend": "nccl",
34
+ "dist_url": "tcp://localhost:54320",
35
+ "world_size": 1
36
+ }
37
+ }
configs/train_grad.json ADDED
@@ -0,0 +1,68 @@
1
+ {
2
+ "xvector": false,
3
+ "pe": false,
4
+ "train": {
5
+ "test_size": 6,
6
+ "n_epochs": 10000,
7
+ "batch_size": 64,
8
+ "learning_rate": 1e-4,
9
+ "seed": 37,
10
+ "save_every": 1,
11
+ "use_gt_dur": false
12
+ },
13
+ "data": {
14
+ "load_mel_from_disk": false,
15
+ "train_utts": "filelists/all_spks/train_utts.txt",
16
+ "val_utts": "filelists/all_spks/eval_utts.txt",
17
+ "train_utt2phns": "filelists/all_spks/text",
18
+ "val_utt2phns": "filelists/all_spks/text",
19
+ "train_feats_scp": "filelists/all_spks/feats.scp",
20
+ "val_feats_scp": "filelists/all_spks/feats.scp",
21
+ "train_utt2spk": "filelists/all_spks/utt2spk.json",
22
+ "val_utt2spk": "filelists/all_spks/utt2spk.json",
23
+ "train_utt2emo": "filelists/all_spks/utt2emo.json",
24
+ "val_utt2emo": "filelists/all_spks/utt2emo.json",
25
+
26
+ "train_var_scp": "",
27
+ "val_var_scp": "",
28
+
29
+ "text_cleaners": [
30
+ "kazakh_cleaners"
31
+ ],
32
+ "max_wav_value": 32768.0,
33
+ "sampling_rate": 22050,
34
+ "filter_length": 1024,
35
+ "hop_length": 200,
36
+ "win_length": 800,
37
+ "n_mel_channels": 80,
38
+ "mel_fmin": 20.0,
39
+ "mel_fmax": 8000.0,
40
+ "utt2phn_path": "data/res_utt2phns.json",
41
+ "add_blank": false
42
+ },
43
+ "model": {
44
+ "n_vocab": 200,
45
+ "n_spks": 3,
46
+ "n_emos": 6,
47
+ "spk_emb_dim": 64,
48
+ "n_enc_channels": 192,
49
+ "filter_channels": 768,
50
+ "filter_channels_dp": 256,
51
+ "n_enc_layers": 6,
52
+ "enc_kernel": 3,
53
+ "enc_dropout": 0.1,
54
+ "n_heads": 2,
55
+ "window_size": 4,
56
+ "dec_dim": 64,
57
+ "beta_min": 0.05,
58
+ "beta_max": 20.0,
59
+ "pe_scale": 1000,
60
+ "d_decoder": 128,
61
+ "l_decoder": 3,
62
+ "k_decoder": 7,
63
+ "h_decoder": 4,
64
+ "decoder_dropout":0.1,
65
+
66
+ "classifier_type": "CNN-with-time"
67
+ }
68
+ }
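app.py reads these values through nested attribute access (hps.data.n_mel_channels, hps.model.d_decoder, and so on) via utils_data.get_hparams_decode_two_mixture(), which is not among the 50 files shown here. The sketch below is a stand-in illustration of that kind of recursive attribute wrapper; HParams is not a class from this repo.

import json

class HParams(dict):
    """Dict with recursive attribute access (illustrative stand-in, not the repo's class)."""
    def __init__(self, mapping):
        super().__init__(mapping)
        for key, value in mapping.items():
            self[key] = HParams(value) if isinstance(value, dict) else value
    __getattr__ = dict.__getitem__

with open("configs/train_grad.json") as f:
    hps = HParams(json.load(f))

print(hps.model.n_emos, hps.data.n_mel_channels, hps.train.batch_size)  # 6 80 64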
data_collate.py ADDED
@@ -0,0 +1,147 @@
1
+ import os.path
2
+ import random
3
+ import numpy as np
4
+ import torch
5
+ import re
6
+ import torch.utils.data
7
+ import json
8
+
9
+ import kaldiio
10
+ from tqdm import tqdm
11
+
12
+
13
+ class BaseCollate:
14
+ def __init__(self, n_frames_per_step=1):
15
+ self.n_frames_per_step = n_frames_per_step
16
+
17
+ def collate_text_mel(self, batch: [dict]):
18
+ """
19
+ :param batch: list of dicts
20
+ """
21
+ utt = list(map(lambda x: x['utt'], batch))
22
+ input_lengths, ids_sorted_decreasing = torch.sort(
23
+ torch.LongTensor([len(x['text']) for x in batch]),
24
+ dim=0, descending=True)
25
+ max_input_len = input_lengths[0]
26
+
27
+ text_padded = torch.LongTensor(len(batch), max_input_len)
28
+ text_padded.zero_()
29
+ for i in range(len(ids_sorted_decreasing)):
30
+ text = batch[ids_sorted_decreasing[i]]['text']
31
+ text_padded[i, :text.size(0)] = text
32
+
33
+ # Right zero-pad mel-spec
34
+ num_mels = batch[0]['mel'].size(0)
35
+ max_target_len = max([x['mel'].size(1) for x in batch])
36
+ if max_target_len % self.n_frames_per_step != 0:
37
+ max_target_len += self.n_frames_per_step - max_target_len % self.n_frames_per_step
38
+ assert max_target_len % self.n_frames_per_step == 0
39
+
40
+ # include mel padded
41
+ mel_padded = torch.FloatTensor(len(batch), num_mels, max_target_len)
42
+ mel_padded.zero_()
43
+ output_lengths = torch.LongTensor(len(batch))
44
+ for i in range(len(ids_sorted_decreasing)):
45
+ mel = batch[ids_sorted_decreasing[i]]['mel']
46
+ mel_padded[i, :, :mel.size(1)] = mel
47
+ output_lengths[i] = mel.size(1)
48
+
49
+ utt_name = np.array(utt)[ids_sorted_decreasing].tolist()
50
+ if isinstance(utt_name, str):
51
+ utt_name = [utt_name]
52
+
53
+ res = {
54
+ "utt": utt_name,
55
+ "text_padded": text_padded,
56
+ "input_lengths": input_lengths,
57
+ "mel_padded": mel_padded,
58
+ "output_lengths": output_lengths,
59
+ }
60
+ return res, ids_sorted_decreasing
61
+
62
+
63
+ class SpkIDCollate(BaseCollate):
64
+ def __call__(self, batch, *args, **kwargs):
65
+ base_data, ids_sorted_decreasing = self.collate_text_mel(batch)
66
+ spk_ids = torch.LongTensor(list(map(lambda x: x["spk_ids"], batch)))
67
+ spk_ids = spk_ids[ids_sorted_decreasing]
68
+ base_data.update({
69
+ "spk_ids": spk_ids
70
+ })
71
+ return base_data
72
+
73
+
74
+ class SpkIDCollateWithEmo(BaseCollate):
75
+ def __call__(self, batch, *args, **kwargs):
76
+ base_data, ids_sorted_decreasing = self.collate_text_mel(batch)
77
+
78
+ spk_ids = torch.LongTensor(list(map(lambda x: x["spk_ids"], batch)))
79
+ spk_ids = spk_ids[ids_sorted_decreasing]
80
+ emo_ids = torch.LongTensor(list(map(lambda x: x['emo_ids'], batch)))
81
+ emo_ids = emo_ids[ids_sorted_decreasing]
82
+ base_data.update({
83
+ "spk_ids": spk_ids,
84
+ "emo_ids": emo_ids
85
+ })
86
+ return base_data
87
+
88
+
89
+ class XvectorCollate(BaseCollate):
90
+ def __call__(self, batch, *args, **kwargs):
91
+ base_data, ids_sorted_decreasing = self.collate_text_mel(batch)
92
+ xvectors = torch.cat(list(map(lambda x: x["xvector"].unsqueeze(0), batch)), dim=0)
93
+ xvectors = xvectors[ids_sorted_decreasing]
94
+ base_data.update({
95
+ "xvector": xvectors
96
+ })
97
+ return base_data
98
+
99
+
100
+ class SpkIDCollateWithPE(BaseCollate):
101
+ def __call__(self, batch, *args, **kwargs):
102
+ base_data, ids_sorted_decreasing = self.collate_text_mel(batch)
103
+ spk_ids = torch.LongTensor(list(map(lambda x: x["spk_ids"], batch)))
104
+ spk_ids = spk_ids[ids_sorted_decreasing]
105
+
106
+ num_var = batch[0]["var"].size(0)
107
+ max_target_len = max([x["var"].size(1) for x in batch])
108
+ if max_target_len % self.n_frames_per_step != 0:
109
+ max_target_len += self.n_frames_per_step - max_target_len % self.n_frames_per_step
110
+ assert max_target_len % self.n_frames_per_step == 0
111
+
112
+ var_padded = torch.FloatTensor(len(batch), num_var, max_target_len)
113
+ var_padded.zero_()
114
+ for i in range(len(ids_sorted_decreasing)):
115
+ var = batch[ids_sorted_decreasing[i]]["var"]
116
+ var_padded[i, :, :var.size(1)] = var
117
+
118
+ base_data.update({
119
+ "spk_ids": spk_ids,
120
+ "var_padded": var_padded
121
+ })
122
+ return base_data
123
+
124
+
125
+ class XvectorCollateWithPE(BaseCollate):
126
+ def __call__(self, batch, *args, **kwargs):
127
+ base_data, ids_sorted_decreasing = self.collate_text_mel(batch)
128
+ xvectors = torch.cat(list(map(lambda x: x["xvector"].unsqueeze(0), batch)), dim=0)
129
+ xvectors = xvectors[ids_sorted_decreasing]
130
+
131
+ num_var = batch[0]["var"].size(0)
132
+ max_target_len = max([x["var"].size(1) for x in batch])
133
+ if max_target_len % self.n_frames_per_step != 0:
134
+ max_target_len += self.n_frames_per_step - max_target_len % self.n_frames_per_step
135
+ assert max_target_len % self.n_frames_per_step == 0
136
+
137
+ var_padded = torch.FloatTensor(len(batch), num_var, max_target_len)
138
+ var_padded.zero_()
139
+ for i in range(len(ids_sorted_decreasing)):
140
+ var = batch[ids_sorted_decreasing[i]]["var"]
141
+ var_padded[i, :, :var.size(1)] = var
142
+
143
+ base_data.update({
144
+ "xvector": xvectors,
145
+ "var_padded": var_padded
146
+ })
147
+ return base_data
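SpkIDCollateWithEmo expects each item to be the dict produced by SpkIDLoaderWithEmo (utt, text, mel, spk_ids, emo_ids) and returns length-sorted, zero-padded tensors. A minimal sketch with synthetic items, assuming data_collate.py is importable from the repository root; the shapes and IDs below are made up for illustration.

import torch
from data_collate import SpkIDCollateWithEmo

# two synthetic items shaped like SpkIDLoaderWithEmo outputs (80-bin mels of different lengths)
batch = [
    {"utt": "utt_a", "text": torch.randint(1, 50, (12,)), "mel": torch.randn(80, 95),
     "spk_ids": 0, "emo_ids": 3},
    {"utt": "utt_b", "text": torch.randint(1, 50, (20,)), "mel": torch.randn(80, 140),
     "spk_ids": 2, "emo_ids": 1},
]

collate = SpkIDCollateWithEmo(n_frames_per_step=1)
out = collate(batch)

# items are reordered by descending text length; shorter entries are zero-padded on the right
print(out["text_padded"].shape, out["mel_padded"].shape)  # torch.Size([2, 20]) torch.Size([2, 80, 140])
print(out["input_lengths"], out["output_lengths"], out["spk_ids"], out["emo_ids"])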
data_loader.py ADDED
@@ -0,0 +1,309 @@
1
+ import os.path
2
+ import random
3
+ import numpy as np
4
+ import torch
5
+ import re
6
+ import torch.utils.data
7
+ import json
8
+
9
+ import kaldiio
10
+ from tqdm import tqdm
11
+ from text import text_to_sequence
12
+
13
+ class BaseLoader(torch.utils.data.Dataset):
14
+ def __init__(self, utts: str, hparams, feats_scp: str, utt2text:str):
15
+ """
16
+ :param utts: file path. A list of utts for this loader; these are the only utts this loader has access to.
17
+ This loader only deals with text, duration and feats. Files other than `utts` may cover a larger set of utterances.
18
+ """
19
+ self.n_mel_channels = hparams.n_mel_channels
20
+ self.sampling_rate = hparams.sampling_rate
21
+ self.utts = self.get_utts(utts)
22
+ self.utt2feat = self.get_utt2feat(feats_scp)
23
+ self.utt2text = self.get_utt2text(utt2text)
24
+
25
+ def get_utts(self, utts: str) -> list:
26
+ with open(utts, 'r') as f:
27
+ L = f.readlines()
28
+ L = list(map(lambda x: x.strip(), L))
29
+ random.seed(1234)
30
+ random.shuffle(L)
31
+ return L
32
+
33
+
34
+ def get_utt2feat(self, feats_scp: str):
35
+ utt2feat = kaldiio.load_scp(feats_scp) # lazy load mode
36
+ print(f"Succeeded reading feats from {feats_scp}")
37
+ return utt2feat
38
+
39
+ def get_utt2text(self, utt2text: str):
40
+ with open(utt2text, 'r') as f:
41
+ L = f.readlines()
42
+ utt2text = {line.split()[0]: line.strip().split(" ", 1)[1] for line in L}
43
+ return utt2text
44
+
45
+ def get_mel_from_kaldi(self, utt):
46
+ feat = self.utt2feat[utt]
47
+ feat = torch.FloatTensor(feat).squeeze()
48
+ assert self.n_mel_channels in feat.shape
49
+ if feat.shape[0] == self.n_mel_channels:
50
+ return feat
51
+ else:
52
+ return feat.T
53
+
54
+ def get_text(self, utt):
55
+ text = self.utt2text[utt]
56
+ text_norm = text_to_sequence(text)
57
+ text_norm = torch.IntTensor(text_norm)
58
+ return text_norm
59
+
60
+ def __getitem__(self, index):
61
+ res = self.get_mel_text_pair(self.utts[index])
62
+ return res
63
+
64
+ def __len__(self):
65
+ return len(self.utts)
66
+
67
+ def sample_test_batch(self, size):
68
+ idx = np.random.choice(range(len(self)), size=size, replace=False)
69
+ test_batch = []
70
+ for index in idx:
71
+ test_batch.append(self.__getitem__(index))
72
+ return test_batch
73
+
74
+
75
+ class SpkIDLoader(BaseLoader):
76
+ def __init__(self, utts: str, hparams, feats_scp: str, utt2phns: str, phn2id: str,
77
+ utt2phn_duration: str, utt2spk: str):
78
+ """
79
+ :param utt2spk: json file path (utt name -> spk id)
80
+ This loader loads speaker as a speaker ID for embedding table
81
+ """
82
+ super(SpkIDLoader, self).__init__(utts, hparams, feats_scp, utt2phns, phn2id, utt2phn_duration)
83
+ self.utt2spk = self.get_utt2spk(utt2spk)
84
+
85
+ def get_utt2spk(self, utt2spk: str) -> dict:
86
+ with open(utt2spk, 'r') as f:
87
+ res = json.load(f)
88
+ return res
89
+
90
+ def get_mel_text_pair(self, utt):
91
+ # separate filename and text
92
+ spkid = self.utt2spk[utt]
93
+ phn_ids = self.get_text(utt)
94
+ mel = self.get_mel_from_kaldi(utt)
95
+ dur = self.get_dur_from_kaldi(utt)
96
+
97
+ assert sum(dur) == mel.shape[1], f"Frame length mismatch: utt {utt}, dur: {sum(dur)}, mel: {mel.shape[1]}"
98
+ res = {
99
+ "utt": utt,
100
+ "mel": mel,
101
+ "spk_ids": spkid
102
+ }
103
+ return res
104
+
105
+ def __getitem__(self, index):
106
+ res = self.get_mel_text_pair(self.utts[index])
107
+ return res
108
+
109
+ def __len__(self):
110
+ return len(self.utts)
111
+
112
+
113
+ class SpkIDLoaderWithEmo(BaseLoader):
114
+ def __init__(self, utts: str, hparams, feats_scp: str, utt2text:str, utt2spk: str, utt2emo: str):
115
+ """
116
+ :param utt2spk: json file path (utt name -> spk id)
117
+ This loader loads speaker as a speaker ID for embedding table
118
+ """
119
+ super(SpkIDLoaderWithEmo, self).__init__(utts, hparams, feats_scp, utt2text)
120
+ self.utt2spk = self.get_utt2spk(utt2spk)
121
+ self.utt2emo = self.get_utt2emo(utt2emo)
122
+
123
+ def get_utt2spk(self, utt2spk: str) -> dict:
124
+ with open(utt2spk, 'r') as f:
125
+ res = json.load(f)
126
+ return res
127
+
128
+ def get_utt2emo(self, utt2emo: str) -> dict:
129
+ with open(utt2emo, 'r') as f:
130
+ res = json.load(f)
131
+ return res
132
+
133
+ def get_mel_text_pair(self, utt):
134
+ # separate filename and text
135
+ spkid = int(self.utt2spk[utt])
136
+ emoid = int(self.utt2emo[utt])
137
+ text = self.get_text(utt)
138
+ mel = self.get_mel_from_kaldi(utt)
139
+
140
+ res = {
141
+ "utt": utt,
142
+ "text": text,
143
+ "mel": mel,
144
+ "spk_ids": spkid,
145
+ "emo_ids": emoid
146
+ }
147
+ return res
148
+
149
+ def __getitem__(self, index):
150
+ res = self.get_mel_text_pair(self.utts[index])
151
+ return res
152
+
153
+ def __len__(self):
154
+ return len(self.utts)
155
+
156
+
157
+ class SpkIDLoaderWithPE(SpkIDLoader):
158
+ def __init__(self, utts: str, hparams, feats_scp: str, utt2phns: str, phn2id: str,
159
+ utt2phn_duration: str, utt2spk: str, var_scp: str):
160
+ """
161
+ This loader loads speaker ID together with variance (4-dim pitch, 1-dim energy)
162
+ """
163
+ super(SpkIDLoaderWithPE, self).__init__(utts, hparams, feats_scp, utt2phns, phn2id, utt2phn_duration, utt2spk)
164
+ self.utt2var = self.get_utt2var(var_scp)
165
+
166
+ def get_utt2var(self, utt2var: str) -> dict:
167
+ res = kaldiio.load_scp(utt2var)
168
+ print(f"Succeeded reading feats from {utt2var}")
169
+ return res
170
+
171
+ def get_var_from_kaldi(self, utt):
172
+ var = self.utt2var[utt]
173
+ var = torch.FloatTensor(var).squeeze()
174
+ assert 5 in var.shape
175
+ if var.shape[0] == 5:
176
+ return var
177
+ else:
178
+ return var.T
179
+
180
+ def get_mel_text_pair(self, utt):
181
+ # separate filename and text
182
+ spkid = self.utt2spk[utt]
183
+ phn_ids = self.get_text(utt)
184
+ mel = self.get_mel_from_kaldi(utt)
185
+ dur = self.get_dur_from_kaldi(utt)
186
+ var = self.get_var_from_kaldi(utt)
187
+
188
+ assert sum(dur) == mel.shape[1] == var.shape[1], \
189
+ f"Frame length mismatch: utt {utt}, dur: {sum(dur)}, mel: {mel.shape[1]}, var: {var.shape[1]}"
190
+
191
+ res = {
192
+ "utt": utt,
193
+ "phn_ids": phn_ids,
194
+ "mel": mel,
195
+ "dur": dur,
196
+ "spk_ids": spkid,
197
+ "var": var
198
+ }
199
+ return res
200
+
201
+
202
+ class XvectorLoader(BaseLoader):
203
+ def __init__(self, utts: str, hparams, feats_scp: str, utt2phns: str, phn2id: str,
204
+ utt2phn_duration: str, utt2spk_name: str, spk_xvector_scp: str):
205
+ """
206
+ :param utt2spk_name: like kaldi-style utt2spk
207
+ :param spk_xvector_scp: kaldi-style speaker-level xvector.scp
208
+ """
209
+ super(XvectorLoader, self).__init__(utts, hparams, feats_scp, utt2phns, phn2id, utt2phn_duration)
210
+ self.utt2spk = self.get_utt2spk(utt2spk_name)
211
+ self.spk2xvector = self.get_spk2xvector(spk_xvector_scp)
212
+
213
+ def get_utt2spk(self, utt2spk):
214
+ res = dict()
215
+ with open(utt2spk, 'r') as f:
216
+ for l in f.readlines():
217
+ res[l.split()[0]] = l.split()[1]
218
+ return res
219
+
220
+ def get_spk2xvector(self, spk_xvector_scp: str) -> dict:
221
+ res = kaldiio.load_scp(spk_xvector_scp)
222
+ print(f"Succeeded reading xvector from {spk_xvector_scp}")
223
+ return res
224
+
225
+ def get_xvector(self, utt):
226
+ xv = self.spk2xvector[self.utt2spk[utt]]
227
+ xv = torch.FloatTensor(xv).squeeze()
228
+ return xv
229
+
230
+ def get_mel_text_pair(self, utt):
231
+ phn_ids = self.get_text(utt)
232
+ mel = self.get_mel_from_kaldi(utt)
233
+ dur = self.get_dur_from_kaldi(utt)
234
+ xvector = self.get_xvector(utt)
235
+
236
+ assert sum(dur) == mel.shape[1], \
237
+ f"Frame length mismatch: utt {utt}, dur: {sum(dur)}, mel: {mel.shape[1]}"
238
+
239
+ res = {
240
+ "utt": utt,
241
+ "phn_ids": phn_ids,
242
+ "mel": mel,
243
+ "dur": dur,
244
+ "xvector": xvector,
245
+ }
246
+ return res
247
+
248
+
249
+ class XvectorLoaderWithPE(BaseLoader):
250
+ def __init__(self, utts: str, hparams, feats_scp: str, utt2phns: str, phn2id: str,
251
+ utt2phn_duration: str, utt2spk_name: str, spk_xvector_scp: str, var_scp: str):
252
+ super(XvectorLoaderWithPE, self).__init__(utts, hparams, feats_scp, utt2phns, phn2id, utt2phn_duration)
253
+ self.utt2spk = self.get_utt2spk(utt2spk_name)
254
+ self.spk2xvector = self.get_spk2xvector(spk_xvector_scp)
255
+ self.utt2var = self.get_utt2var(var_scp)
256
+
257
+ def get_spk2xvector(self, spk_xvector_scp: str) -> dict:
258
+ res = kaldiio.load_scp(spk_xvector_scp)
259
+ print(f"Succeeded reading xvector from {spk_xvector_scp}")
260
+ return res
261
+
262
+ def get_utt2spk(self, utt2spk):
263
+ res = dict()
264
+ with open(utt2spk, 'r') as f:
265
+ for l in f.readlines():
266
+ res[l.split()[0]] = l.split()[1]
267
+ return res
268
+
269
+ def get_utt2var(self, utt2var: str) -> dict:
270
+ res = kaldiio.load_scp(utt2var)
271
+ print(f"Succeeded reading feats from {utt2var}")
272
+ return res
273
+
274
+ def get_var_from_kaldi(self, utt):
275
+ var = self.utt2var[utt]
276
+ var = torch.FloatTensor(var).squeeze()
277
+ assert 5 in var.shape
278
+ if var.shape[0] == 5:
279
+ return var
280
+ else:
281
+ return var.T
282
+
283
+ def get_xvector(self, utt):
284
+ xv = self.spk2xvector[self.utt2spk[utt]]
285
+ xv = torch.FloatTensor(xv).squeeze()
286
+ return xv
287
+
288
+ def get_mel_text_pair(self, utt):
289
+ # separate filename and text
290
+ spkid = self.utt2spk[utt]
291
+ phn_ids = self.get_text(utt)
292
+ mel = self.get_mel_from_kaldi(utt)
293
+ dur = self.get_dur_from_kaldi(utt)
294
+ var = self.get_var_from_kaldi(utt)
295
+ xvector = self.get_xvector(utt)
296
+
297
+ assert sum(dur) == mel.shape[1] == var.shape[1], \
298
+ f"Frame length mismatch: utt {utt}, dur: {sum(dur)}, mel: {mel.shape[1]}, var: {var.shape[1]}"
299
+
300
+ res = {
301
+ "utt": utt,
302
+ "phn_ids": phn_ids,
303
+ "mel": mel,
304
+ "dur": dur,
305
+ "spk_ids": spkid,
306
+ "var": var,
307
+ "xvector": xvector
308
+ }
309
+ return res
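Putting the loader and the collate function together gives a standard PyTorch DataLoader. A minimal sketch using the paths from configs/train_grad.json; it assumes data_preparation.py has already produced the filelists and feats.ark/feats.scp, and SimpleNamespace stands in for the hparams object the repo builds from the config.

from types import SimpleNamespace
from torch.utils.data import DataLoader

from data_collate import SpkIDCollateWithEmo
from data_loader import SpkIDLoaderWithEmo

data_cfg = SimpleNamespace(n_mel_channels=80, sampling_rate=22050)  # the fields BaseLoader reads

dataset = SpkIDLoaderWithEmo(
    utts="filelists/all_spks/train_utts.txt",
    hparams=data_cfg,
    feats_scp="filelists/all_spks/feats.scp",
    utt2text="filelists/all_spks/text",
    utt2spk="filelists/all_spks/utt2spk.json",
    utt2emo="filelists/all_spks/utt2emo.json",
)
loader = DataLoader(dataset, batch_size=16, shuffle=True,
                    collate_fn=SpkIDCollateWithEmo(n_frames_per_step=1))

batch = next(iter(loader))
print(batch["mel_padded"].shape, batch["spk_ids"][:4], batch["emo_ids"][:4])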
data_preparation.py ADDED
@@ -0,0 +1,108 @@
1
+ import kaldiio
2
+ import os
3
+ import librosa
4
+ from tqdm import tqdm
5
+ import glob
6
+ import json
7
+ from shutil import copyfile
8
+ import pandas as pd
9
+ import argparse
10
+ from text import _clean_text, symbols
11
+ from num2words import num2words
12
+ import re
13
+ from melspec import mel_spectrogram
14
+ import torchaudio
15
+
16
+ if __name__ == '__main__':
17
+ parser = argparse.ArgumentParser()
18
+ parser.add_argument('-d', '--data', type=str, required=True, help='path to the emotional dataset')
19
+ args = parser.parse_args()
20
+ dataset_path = args.data
21
+ filelists_path = 'filelists/all_spks/'
22
+ feats_scp_file = filelists_path + 'feats.scp'
23
+ feats_ark_file = filelists_path + 'feats.ark'
24
+
25
+
26
+ spks = ['1263201035', '805570882', '399172782']
27
+ train_files = []
28
+ eval_files = []
29
+ for spk in spks:
30
+ train_files += glob.glob(dataset_path + spk + "/train/*.wav")
31
+ eval_files += glob.glob(dataset_path + spk + "/eval/*.wav")
32
+
33
+ os.makedirs(filelists_path, exist_ok=True)
34
+
35
+ with open(filelists_path + 'train_utts.txt', 'w', encoding='utf-8') as f:
36
+ for wav_path in train_files:
37
+ wav_name = os.path.splitext(os.path.basename(wav_path))[0]
38
+ f.write(wav_name + '\n')
39
+ with open(filelists_path + 'eval_utts.txt', 'w', encoding='utf-8') as f:
40
+ for wav_path in eval_files:
41
+ wav_name = os.path.splitext(os.path.basename(wav_path))[0]
42
+ f.write(wav_name + '\n')
43
+
44
+ with open(feats_scp_file, 'w') as feats_scp, \
45
+ kaldiio.WriteHelper(f'ark,scp:{feats_ark_file},{feats_scp_file}') as writer:
46
+ for root, dirs, files in os.walk(dataset_path):
47
+ for file in tqdm(files):
48
+ if file.endswith('.wav'):
49
+ # Get the file name and relative path to the root folder
50
+ wav_path = os.path.join(root, file)
51
+ rel_path = os.path.relpath(wav_path, dataset_path)
52
+ wav_name = os.path.splitext(os.path.basename(wav_path))[0]
53
+ signal, rate = torchaudio.load(wav_path)
54
+ spec = mel_spectrogram(signal, 1024, 80, 22050, 256,
55
+ 1024, 0, 8000, center=False).squeeze()
56
+ # Write the features to feats.ark and feats.scp
57
+ writer[wav_name] = spec
58
+
59
+
60
+ emotions = [os.path.basename(x).split("_")[1] for x in glob.glob(dataset_path + '/**/**/*')]
61
+ emotions = sorted(set(emotions))
62
+
63
+ utt2spk = {}
64
+ utt2emo = {}
65
+ wavs = glob.glob(dataset_path + '**/**/*.wav')
66
+ for wav_path in tqdm(wavs):
67
+ wav_name = os.path.splitext(os.path.basename(wav_path))[0]
68
+ emotion = emotions.index(wav_name.split("_")[1])
69
+ if wav_path.split('/')[-3] == '1263201035':
70
+ spk = 0 ## labels should start with 0
71
+ elif wav_path.split('/')[-3] == '805570882':
72
+ spk = 1
73
+ else:
74
+ spk = 2
75
+ utt2spk[wav_name] = str(spk)
76
+ utt2emo[wav_name] = str(emotion)
77
+ utt2spk = dict(sorted(utt2spk.items()))
78
+ utt2emo = dict(sorted(utt2emo.items()))
79
+
80
+ with open(filelists_path + 'utt2emo.json', 'w') as fp:
81
+ json.dump(utt2emo, fp, indent=4)
82
+ with open(filelists_path + 'utt2spk.json', 'w') as fp:
83
+ json.dump(utt2spk, fp, indent=4)
84
+
85
+ txt_files = sorted(glob.glob(dataset_path + '/**/**/*.txt'))
86
+ count = 0
87
+ txt = []
88
+ basenames = []
89
+ utt2text = {}
90
+ flag = False
91
+ with open(filelists_path + 'text', 'w', encoding='utf-8') as write:
92
+ for txt_path in txt_files:
93
+ basename = os.path.basename(txt_path).replace('.txt', '')
94
+ with open(txt_path, 'r', encoding='utf-8') as f:
95
+ txt.append(_clean_text(f.read().strip("\n"), cleaner_names=["kazakh_cleaners"]).replace("'", ""))
96
+ basenames.append(basename)
97
+ output_string = [re.sub('(\d+)', lambda m: num2words(m.group(), lang='kz'), sentence) for sentence in txt]
98
+ cleaned_txt = []
99
+ for t in output_string:
100
+ cleaned_txt.append(''.join([s for s in t if s in symbols]))
101
+ utt2text = {basenames[i]: cleaned_txt[i] for i in range(len(cleaned_txt))}
102
+ utt2text = dict(sorted(utt2text.items()))
103
+
104
+ vocab = set()
105
+ with open(filelists_path + '/text', 'w', encoding='utf-8') as f:
106
+ for x, y in utt2text.items():
107
+ for c in y: vocab.add(c)
108
+ f.write(x + ' ' + y + '\n')
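Once the script has run, the generated feats.scp can be spot-checked with the same lazy kaldiio reader the loaders use. A small sketch, assuming filelists/all_spks/feats.ark and feats.scp exist in the working directory.

import kaldiio

utt2feat = kaldiio.load_scp("filelists/all_spks/feats.scp")  # lazy: matrices are read on access
for i, (utt, feat) in enumerate(utt2feat.items()):
    print(utt, feat.shape)  # one axis should be 80 (n_mel_channels)
    if i == 4:
        break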
env.py ADDED
@@ -0,0 +1,15 @@
1
+ import os
2
+ import shutil
3
+
4
+
5
+ class AttrDict(dict):
6
+ def __init__(self, *args, **kwargs):
7
+ super(AttrDict, self).__init__(*args, **kwargs)
8
+ self.__dict__ = self
9
+
10
+
11
+ def build_env(config, config_name, path):
12
+ t_path = os.path.join(path, config_name)
13
+ if config != t_path:
14
+ os.makedirs(path, exist_ok=True)
15
+ shutil.copyfile(config, os.path.join(path, config_name))
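AttrDict simply aliases the instance's attribute namespace to the dict itself, which is why app.py can build h = AttrDict(json.load(f)) and pass it to the HiFi-GAN Generator for attribute-style access. A two-line illustration (the keys shown are examples from config.json):

from env import AttrDict

h = AttrDict({"num_mels": 80, "sampling_rate": 22050})
print(h.num_mels, h["sampling_rate"])  # 80 22050 -- same storage, two access styles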
filelists/all_spks/eval_utts.txt ADDED
@@ -0,0 +1,3730 @@
1
+ 1263201035_surprise_47861
2
+ 1263201035_angry_49695
3
+ 1263201035_happy_52657
4
+ 1263201035_surprise_40339
5
+ 1263201035_angry_7782
6
+ 1263201035_sad_48693
7
+ 1263201035_happy_50748
8
+ 1263201035_fear_53711
9
+ 1263201035_sad_73370
10
+ 1263201035_surprise_40186
11
+ 1263201035_neutral_40342
12
+ 1263201035_happy_66930
13
+ 1263201035_fear_67139
14
+ 1263201035_angry_66597
15
+ 1263201035_sad_11219
16
+ 1263201035_neutral_53029
17
+ 1263201035_sad_51009
18
+ 1263201035_happy_31206
19
+ 1263201035_fear_34063
20
+ 1263201035_neutral_75999
21
+ 1263201035_angry_33304
22
+ 1263201035_angry_33668
23
+ 1263201035_angry_29852
24
+ 1263201035_sad_73176
25
+ 1263201035_fear_8425
26
+ 1263201035_fear_40145
27
+ 1263201035_sad_73359
28
+ 1263201035_sad_31284
29
+ 1263201035_fear_49804
30
+ 1263201035_fear_30407
31
+ 1263201035_surprise_66220
32
+ 1263201035_angry_49725
33
+ 1263201035_angry_75638
34
+ 1263201035_neutral_67147
35
+ 1263201035_angry_34047
36
+ 1263201035_surprise_51008
37
+ 1263201035_fear_40255
38
+ 1263201035_happy_40037
39
+ 1263201035_sad_40178
40
+ 1263201035_sad_49944
41
+ 1263201035_neutral_52556
42
+ 1263201035_sad_32732
43
+ 1263201035_angry_67696
44
+ 1263201035_neutral_52335
45
+ 1263201035_surprise_33940
46
+ 1263201035_surprise_31598
47
+ 1263201035_happy_33813
48
+ 1263201035_happy_52477
49
+ 1263201035_neutral_8508
50
+ 1263201035_happy_22572
51
+ 1263201035_neutral_34101
52
+ 1263201035_happy_4851
53
+ 1263201035_angry_33831
54
+ 1263201035_happy_47841
55
+ 1263201035_angry_48038
56
+ 1263201035_angry_4986
57
+ 1263201035_surprise_29809
58
+ 1263201035_sad_8464
59
+ 1263201035_fear_47763
60
+ 1263201035_happy_29035
61
+ 1263201035_fear_16121
62
+ 1263201035_neutral_4901
63
+ 1263201035_surprise_51233
64
+ 1263201035_sad_33897
65
+ 1263201035_happy_50855
66
+ 1263201035_neutral_31468
67
+ 1263201035_happy_5320
68
+ 1263201035_neutral_66915
69
+ 1263201035_sad_67518
70
+ 1263201035_sad_5335
71
+ 1263201035_sad_52022
72
+ 1263201035_neutral_42295
73
+ 1263201035_happy_63621
74
+ 1263201035_happy_40137
75
+ 1263201035_happy_33655
76
+ 1263201035_neutral_66444
77
+ 1263201035_fear_66382
78
+ 1263201035_surprise_72795
79
+ 1263201035_happy_30297
80
+ 1263201035_angry_8319
81
+ 1263201035_neutral_65132
82
+ 1263201035_happy_31356
83
+ 1263201035_neutral_39066
84
+ 1263201035_surprise_47951
85
+ 1263201035_happy_33626
86
+ 1263201035_sad_50658
87
+ 1263201035_sad_53223
88
+ 1263201035_happy_30012
89
+ 1263201035_fear_40340
90
+ 1263201035_angry_15112
91
+ 1263201035_happy_67544
92
+ 1263201035_happy_51390
93
+ 1263201035_angry_51499
94
+ 1263201035_sad_29077
95
+ 1263201035_angry_75293
96
+ 1263201035_angry_16061
97
+ 1263201035_surprise_47777
98
+ 1263201035_happy_72797
99
+ 1263201035_angry_30428
100
+ 1263201035_neutral_53208
101
+ 1263201035_angry_53197
102
+ 1263201035_happy_75643
103
+ 1263201035_neutral_10824
104
+ 1263201035_happy_49642
105
+ 1263201035_surprise_29714
106
+ 1263201035_sad_50636
107
+ 1263201035_happy_10510
108
+ 1263201035_sad_33891
109
+ 1263201035_happy_52624
110
+ 1263201035_neutral_67354
111
+ 1263201035_angry_48056
112
+ 1263201035_surprise_75526
113
+ 1263201035_sad_75797
114
+ 1263201035_fear_4619
115
+ 1263201035_fear_67891
116
+ 1263201035_sad_31528
117
+ 1263201035_happy_32640
118
+ 1263201035_neutral_4164
119
+ 1263201035_neutral_8352
120
+ 1263201035_surprise_50242
121
+ 1263201035_sad_4376
122
+ 1263201035_sad_50810
123
+ 1263201035_neutral_51154
124
+ 1263201035_neutral_4966
125
+ 1263201035_fear_49929
126
+ 1263201035_angry_53663
127
+ 1263201035_angry_38901
128
+ 1263201035_surprise_9306
129
+ 1263201035_surprise_29785
130
+ 1263201035_angry_32730
131
+ 1263201035_happy_30869
132
+ 1263201035_angry_75682
133
+ 1263201035_happy_75952
134
+ 1263201035_fear_4576
135
+ 1263201035_neutral_63328
136
+ 1263201035_fear_50244
137
+ 1263201035_surprise_51865
138
+ 1263201035_fear_22543
139
+ 1263201035_angry_73324
140
+ 1263201035_sad_72922
141
+ 1263201035_fear_29157
142
+ 1263201035_fear_31299
143
+ 1263201035_neutral_38837
144
+ 1263201035_neutral_29108
145
+ 1263201035_neutral_52324
146
+ 1263201035_neutral_5031
147
+ 1263201035_surprise_75767
148
+ 1263201035_surprise_52615
149
+ 1263201035_happy_39662
150
+ 1263201035_neutral_30912
151
+ 1263201035_fear_72966
152
+ 1263201035_neutral_39697
153
+ 1263201035_happy_29865
154
+ 1263201035_sad_52970
155
+ 1263201035_sad_53429
156
+ 1263201035_angry_38884
157
+ 1263201035_surprise_66140
158
+ 1263201035_angry_39846
159
+ 1263201035_happy_10659
160
+ 1263201035_happy_40112
161
+ 1263201035_neutral_7703
162
+ 1263201035_happy_32874
163
+ 1263201035_fear_63714
164
+ 1263201035_sad_53877
165
+ 1263201035_fear_5013
166
+ 1263201035_fear_29773
167
+ 1263201035_happy_5066
168
+ 1263201035_sad_38392
169
+ 1263201035_surprise_39435
170
+ 1263201035_angry_32501
171
+ 1263201035_angry_73202
172
+ 1263201035_sad_11121
173
+ 1263201035_angry_66478
174
+ 1263201035_angry_75908
175
+ 1263201035_happy_9694
176
+ 1263201035_surprise_11194
177
+ 1263201035_sad_5051
178
+ 1263201035_angry_32308
179
+ 1263201035_surprise_5246
180
+ 1263201035_angry_74571
181
+ 1263201035_angry_75150
182
+ 1263201035_angry_16113
183
+ 1263201035_neutral_30884
184
+ 1263201035_surprise_33211
185
+ 1263201035_happy_5227
186
+ 1263201035_fear_15372
187
+ 1263201035_surprise_32862
188
+ 1263201035_neutral_53264
189
+ 1263201035_fear_66575
190
+ 1263201035_neutral_50966
191
+ 1263201035_angry_39916
192
+ 1263201035_happy_63102
193
+ 1263201035_surprise_51553
194
+ 1263201035_angry_39384
195
+ 1263201035_angry_15549
196
+ 1263201035_angry_33817
197
+ 1263201035_angry_10574
198
+ 1263201035_neutral_52331
199
+ 1263201035_sad_33469
200
+ 1263201035_fear_52564
201
+ 1263201035_fear_8481
202
+ 1263201035_sad_10794
203
+ 1263201035_fear_12090
204
+ 1263201035_happy_31695
205
+ 1263201035_sad_34144
206
+ 1263201035_surprise_75196
207
+ 1263201035_angry_30083
208
+ 1263201035_surprise_22350
209
+ 1263201035_happy_4269
210
+ 1263201035_angry_29368
211
+ 1263201035_happy_48044
212
+ 1263201035_neutral_30394
213
+ 1263201035_sad_74919
214
+ 1263201035_sad_47862
215
+ 1263201035_fear_73319
216
+ 1263201035_sad_50492
217
+ 1263201035_angry_53647
218
+ 1263201035_angry_30492
219
+ 1263201035_fear_51381
220
+ 1263201035_neutral_72947
221
+ 1263201035_sad_75433
222
+ 1263201035_neutral_5895
223
+ 1263201035_surprise_12083
224
+ 1263201035_sad_67908
225
+ 1263201035_surprise_39408
226
+ 1263201035_sad_72934
227
+ 1263201035_happy_66950
228
+ 1263201035_happy_67808
229
+ 1263201035_angry_32407
230
+ 1263201035_sad_49959
231
+ 1263201035_happy_51697
232
+ 1263201035_fear_53931
233
+ 1263201035_angry_67344
234
+ 1263201035_sad_8541
235
+ 1263201035_angry_32480
236
+ 1263201035_happy_66906
237
+ 1263201035_neutral_33447
238
+ 1263201035_happy_32179
239
+ 1263201035_neutral_30333
240
+ 1263201035_fear_30788
241
+ 1263201035_surprise_51693
242
+ 1263201035_sad_51616
243
+ 1263201035_neutral_52096
244
+ 1263201035_angry_33956
245
+ 1263201035_angry_66388
246
+ 1263201035_fear_39108
247
+ 1263201035_surprise_63535
248
+ 1263201035_surprise_50326
249
+ 1263201035_neutral_5142
250
+ 1263201035_neutral_30199
251
+ 1263201035_surprise_75829
252
+ 1263201035_surprise_5900
253
+ 1263201035_neutral_39387
254
+ 1263201035_happy_10858
255
+ 1263201035_fear_52243
256
+ 1263201035_neutral_52010
257
+ 1263201035_angry_63199
258
+ 1263201035_happy_49874
259
+ 1263201035_angry_48000
260
+ 1263201035_sad_8313
261
+ 1263201035_happy_66343
262
+ 1263201035_angry_63130
263
+ 1263201035_neutral_32809
264
+ 1263201035_sad_39396
265
+ 1263201035_angry_22986
266
+ 1263201035_neutral_42307
267
+ 1263201035_angry_72815
268
+ 1263201035_angry_67240
269
+ 1263201035_surprise_47957
270
+ 1263201035_angry_49812
271
+ 1263201035_angry_65040
272
+ 1263201035_neutral_50844
273
+ 1263201035_happy_39561
274
+ 1263201035_neutral_66254
275
+ 1263201035_angry_4584
276
+ 1263201035_fear_75581
277
+ 1263201035_surprise_40034
278
+ 1263201035_sad_32638
279
+ 1263201035_angry_31511
280
+ 1263201035_fear_40023
281
+ 1263201035_angry_66319
282
+ 1263201035_sad_7712
283
+ 1263201035_sad_38422
284
+ 1263201035_fear_49782
285
+ 1263201035_happy_30429
286
+ 1263201035_surprise_66374
287
+ 1263201035_neutral_31308
288
+ 1263201035_surprise_52893
289
+ 1263201035_happy_34030
290
+ 1263201035_sad_31258
291
+ 1263201035_happy_17691
292
+ 1263201035_happy_50449
293
+ 1263201035_surprise_21744
294
+ 1263201035_surprise_51942
295
+ 1263201035_happy_29092
296
+ 1263201035_angry_38434
297
+ 1263201035_happy_30887
298
+ 1263201035_neutral_40141
299
+ 1263201035_neutral_73264
300
+ 1263201035_angry_74880
301
+ 1263201035_surprise_51396
302
+ 1263201035_angry_31619
303
+ 1263201035_neutral_50160
304
+ 1263201035_happy_30843
305
+ 1263201035_angry_66937
306
+ 1263201035_surprise_48633
307
+ 1263201035_neutral_50437
308
+ 1263201035_fear_32622
309
+ 1263201035_sad_33602
310
+ 1263201035_sad_10693
311
+ 1263201035_angry_30010
312
+ 1263201035_sad_37431
313
+ 1263201035_sad_53370
314
+ 1263201035_sad_31638
315
+ 1263201035_sad_5883
316
+ 1263201035_sad_11196
317
+ 1263201035_angry_33837
318
+ 1263201035_neutral_15588
319
+ 1263201035_fear_51092
320
+ 1263201035_angry_12025
321
+ 1263201035_neutral_31454
322
+ 1263201035_surprise_51114
323
+ 1263201035_angry_38674
324
+ 1263201035_surprise_75241
325
+ 1263201035_angry_51729
326
+ 1263201035_surprise_51011
327
+ 1263201035_angry_63670
328
+ 1263201035_happy_4582
329
+ 1263201035_sad_15132
330
+ 1263201035_fear_74586
331
+ 1263201035_neutral_22964
332
+ 1263201035_neutral_66490
333
+ 1263201035_angry_39989
334
+ 1263201035_neutral_30098
335
+ 1263201035_fear_33875
336
+ 1263201035_surprise_73089
337
+ 1263201035_angry_67716
338
+ 1263201035_neutral_63424
339
+ 1263201035_fear_31191
340
+ 1263201035_happy_50178
341
+ 1263201035_neutral_48535
342
+ 1263201035_sad_29939
343
+ 1263201035_surprise_52514
344
+ 1263201035_surprise_14772
345
+ 1263201035_neutral_33820
346
+ 1263201035_neutral_51319
347
+ 1263201035_fear_19112
348
+ 1263201035_sad_30896
349
+ 1263201035_angry_38773
350
+ 1263201035_surprise_49855
351
+ 1263201035_angry_67517
352
+ 1263201035_sad_52937
353
+ 1263201035_sad_34274
354
+ 1263201035_angry_38860
355
+ 1263201035_angry_67267
356
+ 1263201035_fear_39004
357
+ 1263201035_happy_38922
358
+ 1263201035_surprise_32569
359
+ 1263201035_happy_32071
360
+ 1263201035_neutral_52193
361
+ 1263201035_fear_40222
362
+ 1263201035_sad_30995
363
+ 1263201035_neutral_42293
364
+ 1263201035_happy_48597
365
+ 1263201035_fear_74644
366
+ 1263201035_angry_4478
367
+ 1263201035_fear_4473
368
+ 1263201035_happy_52573
369
+ 1263201035_happy_31639
370
+ 1263201035_fear_67880
371
+ 1263201035_fear_63680
372
+ 1263201035_neutral_8662
373
+ 1263201035_neutral_11011
374
+ 1263201035_sad_51190
375
+ 1263201035_angry_67476
376
+ 1263201035_sad_66335
377
+ 1263201035_neutral_38929
378
+ 1263201035_surprise_4976
379
+ 1263201035_surprise_4181
380
+ 1263201035_angry_75223
381
+ 1263201035_fear_51118
382
+ 1263201035_fear_33610
383
+ 1263201035_sad_73105
384
+ 1263201035_sad_75988
385
+ 1263201035_happy_52925
386
+ 1263201035_surprise_73234
387
+ 1263201035_fear_8574
388
+ 1263201035_happy_10865
389
+ 1263201035_neutral_42311
390
+ 1263201035_sad_10413
391
+ 1263201035_happy_30812
392
+ 1263201035_neutral_29997
393
+ 1263201035_neutral_10538
394
+ 1263201035_happy_72962
395
+ 1263201035_fear_32169
396
+ 1263201035_sad_31600
397
+ 1263201035_fear_51048
398
+ 1263201035_happy_67949
399
+ 1263201035_happy_15173
400
+ 1263201035_surprise_39861
401
+ 1263201035_sad_53536
402
+ 1263201035_surprise_67765
403
+ 1263201035_happy_39074
404
+ 1263201035_neutral_50745
405
+ 1263201035_surprise_52274
406
+ 1263201035_angry_38765
407
+ 1263201035_angry_33487
408
+ 1263201035_neutral_73005
409
+ 1263201035_fear_32574
410
+ 1263201035_neutral_50283
411
+ 1263201035_fear_74772
412
+ 1263201035_angry_53856
413
+ 1263201035_angry_53403
414
+ 1263201035_sad_66511
415
+ 1263201035_sad_31329
416
+ 1263201035_surprise_75834
417
+ 1263201035_sad_66349
418
+ 1263201035_angry_38866
419
+ 1263201035_angry_8642
420
+ 1263201035_happy_4672
421
+ 1263201035_neutral_30997
422
+ 1263201035_neutral_51490
423
+ 1263201035_happy_15567
424
+ 1263201035_surprise_75842
425
+ 1263201035_happy_40352
426
+ 1263201035_happy_50029
427
+ 1263201035_surprise_11414
428
+ 1263201035_fear_73349
429
+ 1263201035_neutral_53602
430
+ 1263201035_neutral_72894
431
+ 1263201035_surprise_75663
432
+ 1263201035_surprise_53224
433
+ 1263201035_happy_16039
434
+ 1263201035_happy_30270
435
+ 1263201035_sad_31155
436
+ 1263201035_angry_5917
437
+ 1263201035_angry_11185
438
+ 1263201035_happy_50051
439
+ 1263201035_angry_4802
440
+ 1263201035_neutral_51275
441
+ 1263201035_angry_10726
442
+ 1263201035_angry_50739
443
+ 1263201035_neutral_51548
444
+ 1263201035_neutral_63465
445
+ 1263201035_fear_4140
446
+ 1263201035_happy_17671
447
+ 1263201035_surprise_51062
448
+ 1263201035_neutral_38910
449
+ 1263201035_angry_33486
450
+ 1263201035_surprise_19100
451
+ 1263201035_neutral_67662
452
+ 1263201035_happy_66974
453
+ 1263201035_fear_51238
454
+ 1263201035_angry_52782
455
+ 1263201035_surprise_9292
456
+ 1263201035_happy_51591
457
+ 1263201035_neutral_66318
458
+ 1263201035_sad_29226
459
+ 1263201035_happy_51129
460
+ 1263201035_fear_75378
461
+ 1263201035_happy_39023
462
+ 1263201035_neutral_39436
463
+ 1263201035_fear_10860
464
+ 1263201035_neutral_40155
465
+ 1263201035_angry_32592
466
+ 1263201035_angry_29315
467
+ 1263201035_neutral_51084
468
+ 1263201035_angry_50960
469
+ 1263201035_fear_47808
470
+ 1263201035_happy_8607
471
+ 1263201035_sad_11327
472
+ 1263201035_surprise_52222
473
+ 1263201035_fear_29913
474
+ 1263201035_sad_5146
475
+ 1263201035_neutral_30035
476
[filelists/all_spks/eval_utts.txt, added diff lines 477–2627: roughly 2,150 further evaluation utterance IDs of the form <speaker_id>_<emotion>_<utterance_id>, covering speakers 1263201035 and 805570882 across the emotions angry, fear, happy, neutral, sad, and surprise; the complete list is in filelists/all_spks/eval_utts.txt]
2629
+ 805570882_angry_63870
2630
+ 805570882_surprise_17828
2631
+ 805570882_fear_41527
2632
+ 805570882_fear_14986
2633
+ 805570882_fear_42079
2634
+ 805570882_angry_44423
2635
+ 805570882_angry_34303
2636
+ 805570882_neutral_64079
2637
+ 805570882_angry_15508
2638
+ 805570882_neutral_22681
2639
+ 805570882_angry_43393
2640
+ 805570882_fear_18620
2641
+ 805570882_surprise_29606
2642
+ 805570882_sad_56914
2643
+ 805570882_sad_22628
2644
+ 805570882_angry_28322
2645
+ 805570882_neutral_45780
2646
+ 805570882_happy_45865
2647
+ 805570882_happy_14725
2648
+ 805570882_fear_19054
2649
+ 805570882_sad_28682
2650
+ 805570882_sad_24821
2651
+ 805570882_neutral_20447
2652
+ 805570882_neutral_54139
2653
+ 805570882_fear_43053
2654
+ 805570882_sad_45853
2655
+ 805570882_surprise_42146
2656
+ 805570882_angry_22837
2657
+ 805570882_angry_28818
2658
+ 805570882_happy_35757
2659
+ 805570882_happy_17169
2660
+ 805570882_angry_63191
2661
+ 805570882_sad_14681
2662
+ 805570882_fear_18623
2663
+ 805570882_neutral_56893
2664
+ 805570882_neutral_36339
2665
+ 805570882_neutral_64793
2666
+ 805570882_fear_61020
2667
+ 805570882_fear_61760
2668
+ 805570882_neutral_57374
2669
+ 805570882_fear_27793
2670
+ 805570882_neutral_20215
2671
+ 805570882_surprise_34473
2672
+ 805570882_neutral_70777
2673
+ 805570882_happy_61045
2674
+ 805570882_neutral_55405
2675
+ 805570882_angry_14669
2676
+ 805570882_fear_28673
2677
+ 805570882_surprise_45736
2678
+ 805570882_neutral_24169
2679
+ 805570882_angry_24206
2680
+ 805570882_angry_21250
2681
+ 805570882_sad_23097
2682
+ 805570882_neutral_41383
2683
+ 805570882_neutral_46968
2684
+ 805570882_fear_37989
2685
+ 805570882_sad_26499
2686
+ 805570882_neutral_45863
2687
+ 805570882_sad_74386
2688
+ 805570882_angry_61514
2689
+ 805570882_surprise_61528
2690
+ 805570882_fear_24521
2691
+ 805570882_fear_66097
2692
+ 805570882_surprise_26387
2693
+ 805570882_sad_74310
2694
+ 805570882_angry_43884
2695
+ 805570882_neutral_47429
2696
+ 805570882_angry_37887
2697
+ 805570882_sad_28582
2698
+ 805570882_surprise_36087
2699
+ 805570882_sad_20287
2700
+ 805570882_neutral_14946
2701
+ 805570882_happy_18257
2702
+ 805570882_happy_46488
2703
+ 805570882_angry_49151
2704
+ 805570882_sad_40927
2705
+ 805570882_fear_43801
2706
+ 805570882_angry_75113
2707
+ 805570882_sad_29402
2708
+ 805570882_surprise_70883
2709
+ 805570882_happy_65968
2710
+ 805570882_happy_35616
2711
+ 805570882_neutral_35560
2712
+ 805570882_surprise_46668
2713
+ 805570882_fear_24301
2714
+ 805570882_surprise_46642
2715
+ 805570882_sad_49262
2716
+ 805570882_surprise_16006
2717
+ 805570882_angry_43905
2718
+ 805570882_happy_46758
2719
+ 805570882_neutral_18957
2720
+ 805570882_sad_34427
2721
+ 805570882_neutral_56527
2722
+ 805570882_neutral_43849
2723
+ 805570882_fear_38269
2724
+ 805570882_surprise_38234
2725
+ 805570882_angry_18674
2726
+ 805570882_angry_24205
2727
+ 805570882_angry_57557
2728
+ 805570882_angry_57265
2729
+ 805570882_happy_27231
2730
+ 805570882_angry_48102
2731
+ 805570882_surprise_48317
2732
+ 805570882_angry_26432
2733
+ 805570882_angry_15332
2734
+ 805570882_sad_36847
2735
+ 805570882_happy_61571
2736
+ 805570882_neutral_21408
2737
+ 805570882_surprise_43376
2738
+ 805570882_neutral_34939
2739
+ 805570882_fear_28328
2740
+ 805570882_angry_40588
2741
+ 805570882_surprise_21262
2742
+ 805570882_surprise_34709
2743
+ 805570882_surprise_14493
2744
+ 805570882_happy_19012
2745
+ 805570882_neutral_26938
2746
+ 805570882_happy_35972
2747
+ 805570882_happy_28844
2748
+ 805570882_sad_40390
2749
+ 805570882_surprise_58370
2750
+ 805570882_angry_46759
2751
+ 805570882_neutral_22871
2752
+ 805570882_fear_16729
2753
+ 805570882_happy_43948
2754
+ 805570882_angry_63843
2755
+ 805570882_surprise_49334
2756
+ 805570882_fear_64750
2757
+ 805570882_sad_37043
2758
+ 805570882_fear_57443
2759
+ 805570882_happy_33344
2760
+ 805570882_neutral_34782
2761
+ 805570882_sad_37673
2762
+ 805570882_neutral_24112
2763
+ 805570882_angry_43630
2764
+ 805570882_angry_24436
2765
+ 805570882_surprise_27318
2766
+ 805570882_angry_16578
2767
+ 805570882_neutral_36009
2768
+ 805570882_angry_64706
2769
+ 805570882_happy_20095
2770
+ 805570882_sad_64918
2771
+ 805570882_happy_37318
2772
+ 805570882_happy_34485
2773
+ 805570882_neutral_65807
2774
+ 805570882_sad_44803
2775
+ 805570882_angry_38351
2776
+ 805570882_neutral_36490
2777
+ 805570882_sad_46804
2778
+ 805570882_sad_74519
2779
+ 805570882_sad_46479
2780
+ 805570882_surprise_20144
2781
+ 805570882_surprise_40814
2782
+ 805570882_fear_37260
2783
+ 805570882_fear_18549
2784
+ 805570882_surprise_35120
2785
+ 805570882_neutral_46261
2786
+ 805570882_angry_70976
2787
+ 805570882_sad_46962
2788
+ 805570882_angry_35605
2789
+ 805570882_surprise_24890
2790
+ 805570882_neutral_35021
2791
+ 805570882_surprise_46660
2792
+ 805570882_sad_41349
2793
+ 805570882_angry_48755
2794
+ 805570882_sad_27704
2795
+ 805570882_angry_36791
2796
+ 805570882_happy_14849
2797
+ 805570882_neutral_44437
2798
+ 805570882_neutral_70643
2799
+ 805570882_angry_37323
2800
+ 805570882_neutral_74089
2801
+ 805570882_happy_45920
2802
+ 805570882_surprise_27592
2803
+ 805570882_sad_43514
2804
+ 805570882_fear_37306
2805
+ 805570882_happy_36235
2806
+ 805570882_neutral_45657
2807
+ 805570882_neutral_43590
2808
+ 805570882_surprise_27124
2809
+ 805570882_surprise_18223
2810
+ 805570882_neutral_55261
2811
+ 805570882_sad_70675
2812
+ 805570882_angry_31729
2813
+ 805570882_sad_40847
2814
+ 805570882_fear_42203
2815
+ 805570882_surprise_27298
2816
+ 805570882_fear_23292
2817
+ 805570882_fear_47447
2818
+ 805570882_angry_48365
2819
+ 805570882_happy_27624
2820
+ 805570882_neutral_18190
2821
+ 805570882_surprise_49088
2822
+ 805570882_sad_44559
2823
+ 805570882_happy_28226
2824
+ 805570882_fear_46916
2825
+ 805570882_angry_36922
2826
+ 805570882_sad_62892
2827
+ 805570882_surprise_47131
2828
+ 805570882_fear_46547
2829
+ 805570882_angry_48893
2830
+ 805570882_neutral_44394
2831
+ 805570882_happy_18904
2832
+ 805570882_neutral_64899
2833
+ 805570882_angry_19417
2834
+ 805570882_angry_18698
2835
+ 805570882_happy_33218
2836
+ 805570882_angry_44343
2837
+ 805570882_surprise_16236
2838
+ 805570882_fear_17760
2839
+ 805570882_happy_18576
2840
+ 805570882_fear_36518
2841
+ 805570882_fear_28736
2842
+ 805570882_sad_45585
2843
+ 805570882_fear_17969
2844
+ 805570882_happy_56715
2845
+ 805570882_surprise_46414
2846
+ 805570882_neutral_64910
2847
+ 805570882_sad_17043
2848
+ 805570882_happy_23904
2849
+ 805570882_sad_74539
2850
+ 805570882_happy_49089
2851
+ 805570882_happy_55289
2852
+ 805570882_happy_74141
2853
+ 805570882_happy_38212
2854
+ 805570882_surprise_61185
2855
+ 805570882_neutral_74079
2856
+ 805570882_surprise_14911
2857
+ 805570882_happy_43445
2858
+ 805570882_sad_24312
2859
+ 805570882_fear_42821
2860
+ 805570882_fear_16134
2861
+ 805570882_happy_36433
2862
+ 805570882_surprise_63989
2863
+ 805570882_sad_33398
2864
+ 805570882_surprise_55438
2865
+ 805570882_angry_55229
2866
+ 805570882_surprise_36972
2867
+ 805570882_fear_55610
2868
+ 805570882_angry_66011
2869
+ 805570882_angry_61056
2870
+ 805570882_happy_54161
2871
+ 805570882_angry_46856
2872
+ 805570882_surprise_73883
2873
+ 805570882_neutral_65086
2874
+ 805570882_sad_44218
2875
+ 805570882_neutral_23036
2876
+ 805570882_surprise_37596
2877
+ 805570882_happy_16205
2878
+ 805570882_angry_55711
2879
+ 805570882_sad_35891
2880
+ 805570882_fear_49161
2881
+ 805570882_surprise_74043
2882
+ 805570882_angry_41026
2883
+ 805570882_sad_21372
2884
+ 805570882_angry_48428
2885
+ 805570882_angry_65008
2886
+ 805570882_happy_56627
2887
+ 805570882_angry_56682
2888
+ 805570882_surprise_41343
2889
+ 805570882_neutral_35752
2890
+ 805570882_angry_21416
2891
+ 805570882_fear_15232
2892
+ 805570882_fear_18329
2893
+ 805570882_surprise_34852
2894
+ 805570882_angry_75029
2895
+ 805570882_surprise_37853
2896
+ 805570882_happy_29492
2897
+ 805570882_angry_37338
2898
+ 805570882_surprise_74333
2899
+ 805570882_fear_42912
2900
+ 805570882_fear_23077
2901
+ 805570882_fear_29652
2902
+ 805570882_fear_44313
2903
+ 805570882_neutral_74210
2904
+ 805570882_angry_46183
2905
+ 805570882_angry_34822
2906
+ 805570882_surprise_41500
2907
+ 805570882_sad_43356
2908
+ 805570882_happy_43524
2909
+ 805570882_neutral_44151
2910
+ 805570882_sad_54921
2911
+ 805570882_neutral_23191
2912
+ 805570882_fear_26895
2913
+ 805570882_happy_42836
2914
+ 805570882_angry_33244
2915
+ 805570882_surprise_55169
2916
+ 805570882_angry_74535
2917
+ 805570882_happy_65904
2918
+ 805570882_happy_61098
2919
+ 805570882_sad_43537
2920
+ 805570882_angry_20270
2921
+ 805570882_angry_29645
2922
+ 805570882_fear_57474
2923
+ 805570882_neutral_24535
2924
+ 805570882_fear_54218
2925
+ 805570882_happy_24996
2926
+ 805570882_happy_48313
2927
+ 805570882_surprise_44357
2928
+ 805570882_surprise_45547
2929
+ 805570882_neutral_44223
2930
+ 805570882_sad_36924
2931
+ 805570882_angry_63212
2932
+ 805570882_happy_34551
2933
+ 805570882_happy_24002
2934
+ 805570882_fear_63804
2935
+ 805570882_happy_46403
2936
+ 805570882_surprise_31755
2937
+ 805570882_neutral_41566
2938
+ 805570882_neutral_44401
2939
+ 805570882_neutral_55269
2940
+ 805570882_sad_43432
2941
+ 805570882_neutral_26459
2942
+ 805570882_fear_15910
2943
+ 805570882_sad_34699
2944
+ 805570882_neutral_48805
2945
+ 805570882_surprise_15003
2946
+ 805570882_sad_66030
2947
+ 805570882_angry_31742
2948
+ 805570882_neutral_54239
2949
+ 805570882_surprise_45007
2950
+ 805570882_surprise_44400
2951
+ 805570882_angry_31748
2952
+ 805570882_neutral_14732
2953
+ 805570882_neutral_29601
2954
+ 805570882_happy_43634
2955
+ 805570882_angry_55744
2956
+ 805570882_fear_34835
2957
+ 805570882_happy_24280
2958
+ 805570882_fear_15881
2959
+ 805570882_sad_15829
2960
+ 805570882_surprise_36403
2961
+ 805570882_angry_16341
2962
+ 805570882_angry_70658
2963
+ 805570882_neutral_57355
2964
+ 805570882_neutral_23973
2965
+ 805570882_angry_22996
2966
+ 805570882_angry_42843
2967
+ 805570882_fear_16641
2968
+ 805570882_sad_54889
2969
+ 805570882_surprise_57291
2970
+ 805570882_surprise_28900
2971
+ 805570882_happy_37635
2972
+ 805570882_fear_46304
2973
+ 805570882_surprise_19466
2974
+ 805570882_angry_73857
2975
+ 805570882_surprise_46611
2976
+ 805570882_surprise_56538
2977
+ 805570882_neutral_21202
2978
+ 805570882_angry_27445
2979
+ 805570882_happy_29613
2980
+ 805570882_angry_19294
2981
+ 805570882_angry_18344
2982
+ 805570882_angry_35037
2983
+ 805570882_surprise_36661
2984
+ 805570882_happy_56537
2985
+ 805570882_surprise_40461
2986
+ 805570882_happy_46608
2987
+ 805570882_happy_18218
2988
+ 805570882_angry_34937
2989
+ 805570882_sad_38101
2990
+ 805570882_surprise_45796
2991
+ 805570882_angry_65886
2992
+ 805570882_fear_55795
2993
+ 805570882_fear_57597
2994
+ 805570882_happy_33126
2995
+ 805570882_angry_24308
2996
+ 805570882_angry_27183
2997
+ 805570882_sad_37126
2998
+ 805570882_sad_44661
2999
+ 805570882_angry_31784
3000
+ 805570882_happy_35084
3001
+ 805570882_fear_61001
3002
+ 805570882_sad_57482
3003
+ 805570882_angry_41932
3004
+ 805570882_angry_34867
3005
+ 805570882_neutral_44677
3006
+ 805570882_neutral_27202
3007
+ 805570882_angry_33349
3008
+ 805570882_angry_37831
3009
+ 805570882_neutral_14872
3010
+ 805570882_surprise_41884
3011
+ 805570882_neutral_41604
3012
+ 805570882_neutral_36220
3013
+ 805570882_neutral_64840
3014
+ 805570882_neutral_15484
3015
+ 805570882_happy_41543
3016
+ 805570882_neutral_57352
3017
+ 805570882_happy_28619
3018
+ 805570882_neutral_21264
3019
+ 805570882_angry_37528
3020
+ 805570882_surprise_36441
3021
+ 805570882_fear_64032
3022
+ 805570882_angry_36195
3023
+ 805570882_surprise_44933
3024
+ 805570882_neutral_58399
3025
+ 805570882_angry_71018
3026
+ 805570882_fear_44279
3027
+ 805570882_neutral_17777
3028
+ 805570882_sad_46527
3029
+ 805570882_neutral_42989
3030
+ 805570882_angry_26289
3031
+ 805570882_neutral_24748
3032
+ 805570882_surprise_57708
3033
+ 805570882_angry_23914
3034
+ 805570882_angry_31722
3035
+ 805570882_sad_41586
3036
+ 805570882_happy_38086
3037
+ 805570882_neutral_15482
3038
+ 805570882_surprise_37567
3039
+ 805570882_angry_27571
3040
+ 805570882_sad_55283
3041
+ 805570882_angry_65859
3042
+ 805570882_fear_36102
3043
+ 805570882_angry_18575
3044
+ 805570882_fear_57386
3045
+ 805570882_surprise_23110
3046
+ 805570882_happy_48919
3047
+ 805570882_angry_34847
3048
+ 805570882_neutral_18816
3049
+ 805570882_fear_44028
3050
+ 805570882_surprise_21450
3051
+ 805570882_sad_63198
3052
+ 805570882_sad_43168
3053
+ 805570882_surprise_45029
3054
+ 805570882_fear_37279
3055
+ 805570882_angry_58484
3056
+ 805570882_surprise_34390
3057
+ 805570882_neutral_46615
3058
+ 805570882_neutral_36721
3059
+ 805570882_angry_21224
3060
+ 805570882_surprise_38309
3061
+ 805570882_happy_22725
3062
+ 805570882_angry_33085
3063
+ 805570882_sad_38002
3064
+ 805570882_neutral_65777
3065
+ 805570882_neutral_44761
3066
+ 805570882_angry_16361
3067
+ 805570882_fear_14419
3068
+ 805570882_angry_63751
3069
+ 805570882_neutral_28417
3070
+ 805570882_surprise_17733
3071
+ 805570882_sad_61230
3072
+ 805570882_fear_14667
3073
+ 805570882_fear_61475
3074
+ 805570882_fear_35042
3075
+ 805570882_surprise_46657
3076
+ 805570882_surprise_23909
3077
+ 805570882_happy_64883
3078
+ 805570882_fear_48163
3079
+ 805570882_sad_37067
3080
+ 805570882_angry_17874
3081
+ 805570882_sad_45773
3082
+ 805570882_happy_37534
3083
+ 805570882_surprise_46548
3084
+ 805570882_fear_20357
3085
+ 805570882_sad_34756
3086
+ 805570882_sad_56618
3087
+ 805570882_neutral_54393
3088
+ 805570882_fear_41922
3089
+ 805570882_neutral_24475
3090
+ 805570882_neutral_17968
3091
+ 805570882_angry_24430
3092
+ 805570882_angry_31865
3093
+ 805570882_fear_57559
3094
+ 805570882_angry_73889
3095
+ 805570882_surprise_48905
3096
+ 805570882_surprise_42945
3097
+ 805570882_neutral_23222
3098
+ 805570882_surprise_14994
3099
+ 805570882_surprise_35761
3100
+ 805570882_angry_36555
3101
+ 805570882_sad_46816
3102
+ 805570882_angry_46201
3103
+ 805570882_neutral_43220
3104
+ 805570882_surprise_22791
3105
+ 805570882_neutral_46613
3106
+ 805570882_neutral_15514
3107
+ 805570882_fear_40961
3108
+ 805570882_sad_54391
3109
+ 805570882_angry_55678
3110
+ 805570882_surprise_58499
3111
+ 805570882_angry_36830
3112
+ 805570882_sad_63285
3113
+ 805570882_sad_70919
3114
+ 805570882_neutral_20281
3115
+ 805570882_sad_16273
3116
+ 805570882_angry_36468
3117
+ 805570882_fear_36052
3118
+ 805570882_sad_34726
3119
+ 805570882_neutral_16286
3120
+ 805570882_angry_61534
3121
+ 805570882_angry_37370
3122
+ 805570882_fear_54201
3123
+ 805570882_surprise_34766
3124
+ 805570882_surprise_14520
3125
+ 805570882_neutral_19001
3126
+ 805570882_neutral_41029
3127
+ 805570882_fear_63216
3128
+ 805570882_sad_54923
3129
+ 805570882_fear_19386
3130
+ 805570882_surprise_70901
3131
+ 805570882_sad_31854
3132
+ 805570882_neutral_46944
3133
+ 805570882_neutral_56605
3134
+ 805570882_surprise_24261
3135
+ 805570882_surprise_43416
3136
+ 805570882_happy_45947
3137
+ 805570882_angry_70792
3138
+ 805570882_neutral_24454
3139
+ 805570882_fear_44478
3140
+ 805570882_happy_63103
3141
+ 805570882_sad_23234
3142
+ 805570882_fear_37072
3143
+ 805570882_neutral_45795
3144
+ 805570882_surprise_14840
3145
+ 805570882_sad_36831
3146
+ 805570882_fear_19237
3147
+ 805570882_neutral_17300
3148
+ 805570882_angry_34850
3149
+ 805570882_happy_31917
3150
+ 805570882_neutral_29629
3151
+ 805570882_surprise_42085
3152
+ 805570882_sad_70640
3153
+ 805570882_sad_37579
3154
+ 805570882_angry_41944
3155
+ 805570882_sad_23976
3156
+ 805570882_happy_55242
3157
+ 805570882_fear_14737
3158
+ 805570882_happy_42941
3159
+ 805570882_surprise_55054
3160
+ 805570882_angry_19978
3161
+ 805570882_angry_74198
3162
+ 805570882_fear_40589
3163
+ 805570882_neutral_70750
3164
+ 805570882_happy_40868
3165
+ 805570882_happy_38233
3166
+ 805570882_angry_41263
3167
+ 805570882_angry_14716
3168
+ 805570882_angry_28329
3169
+ 805570882_angry_20465
3170
+ 805570882_angry_34734
3171
+ 805570882_neutral_64860
3172
+ 805570882_angry_63956
3173
+ 805570882_happy_48241
3174
+ 805570882_surprise_26994
3175
+ 805570882_fear_37057
3176
+ 805570882_fear_17743
3177
+ 805570882_angry_24985
3178
+ 805570882_fear_29556
3179
+ 805570882_surprise_46957
3180
+ 805570882_surprise_57833
3181
+ 805570882_neutral_19305
3182
+ 805570882_angry_35169
3183
+ 805570882_fear_57811
3184
+ 805570882_happy_26985
3185
+ 805570882_fear_29427
3186
+ 805570882_fear_64017
3187
+ 805570882_neutral_35156
3188
+ 805570882_happy_34826
3189
+ 805570882_neutral_43228
3190
+ 805570882_surprise_36526
3191
+ 805570882_sad_27636
3192
+ 805570882_neutral_61766
3193
+ 805570882_surprise_16371
3194
+ 805570882_neutral_26414
3195
+ 805570882_neutral_37225
3196
+ 805570882_sad_19864
3197
+ 805570882_surprise_63820
3198
+ 805570882_angry_23922
3199
+ 805570882_surprise_75124
3200
+ 805570882_sad_54268
3201
+ 805570882_neutral_24572
3202
+ 805570882_happy_37331
3203
+ 805570882_neutral_27325
3204
+ 805570882_neutral_61596
3205
+ 805570882_sad_19371
3206
+ 805570882_neutral_28440
3207
+ 805570882_neutral_58398
3208
+ 805570882_angry_17845
3209
+ 805570882_angry_38054
3210
+ 805570882_sad_64756
3211
+ 805570882_angry_27606
3212
+ 805570882_fear_64785
3213
+ 805570882_angry_17876
3214
+ 805570882_angry_40775
3215
+ 399172782_happy_73593
3216
+ 399172782_sad_65138
3217
+ 399172782_surprise_36154
3218
+ 399172782_angry_26197
3219
+ 399172782_angry_26851
3220
+ 399172782_surprise_69860
3221
+ 399172782_fear_45465
3222
+ 399172782_sad_61916
3223
+ 399172782_angry_57855
3224
+ 399172782_sad_64636
3225
+ 399172782_neutral_19617
3226
+ 399172782_surprise_23787
3227
+ 399172782_happy_64346
3228
+ 399172782_sad_19777
3229
+ 399172782_surprise_73551
3230
+ 399172782_neutral_70247
3231
+ 399172782_surprise_54644
3232
+ 399172782_neutral_58107
3233
+ 399172782_angry_61939
3234
+ 399172782_angry_64651
3235
+ 399172782_fear_61898
3236
+ 399172782_surprise_58296
3237
+ 399172782_angry_69702
3238
+ 399172782_surprise_26223
3239
+ 399172782_fear_49197
3240
+ 399172782_surprise_62017
3241
+ 399172782_neutral_65346
3242
+ 399172782_sad_25408
3243
+ 399172782_angry_55097
3244
+ 399172782_sad_69732
3245
+ 399172782_angry_72399
3246
+ 399172782_neutral_68243
3247
+ 399172782_happy_20516
3248
+ 399172782_fear_57709
3249
+ 399172782_fear_21586
3250
+ 399172782_angry_56016
3251
+ 399172782_angry_26699
3252
+ 399172782_fear_49581
3253
+ 399172782_angry_61956
3254
+ 399172782_neutral_26843
3255
+ 399172782_surprise_21652
3256
+ 399172782_fear_55502
3257
+ 399172782_surprise_56362
3258
+ 399172782_sad_61865
3259
+ 399172782_angry_70319
3260
+ 399172782_happy_72504
3261
+ 399172782_angry_61831
3262
+ 399172782_surprise_45284
3263
+ 399172782_sad_69893
3264
+ 399172782_neutral_57605
3265
+ 399172782_fear_69365
3266
+ 399172782_angry_61977
3267
+ 399172782_surprise_65525
3268
+ 399172782_neutral_61966
3269
+ 399172782_neutral_70302
3270
+ 399172782_sad_62656
3271
+ 399172782_fear_23803
3272
+ 399172782_neutral_69630
3273
+ 399172782_sad_57222
3274
+ 399172782_neutral_70094
3275
+ 399172782_fear_19666
3276
+ 399172782_fear_45311
3277
+ 399172782_angry_49086
3278
+ 399172782_surprise_45091
3279
+ 399172782_angry_72356
3280
+ 399172782_sad_57654
3281
+ 399172782_surprise_58135
3282
+ 399172782_surprise_20873
3283
+ 399172782_happy_21508
3284
+ 399172782_sad_70375
3285
+ 399172782_fear_20849
3286
+ 399172782_happy_70008
3287
+ 399172782_happy_62668
3288
+ 399172782_angry_62562
3289
+ 399172782_fear_65254
3290
+ 399172782_sad_45494
3291
+ 399172782_angry_49376
3292
+ 399172782_sad_61995
3293
+ 399172782_fear_57961
3294
+ 399172782_angry_55973
3295
+ 399172782_happy_21648
3296
+ 399172782_fear_45233
3297
+ 399172782_angry_54654
3298
+ 399172782_surprise_45298
3299
+ 399172782_neutral_45106
3300
+ 399172782_angry_19718
3301
+ 399172782_fear_64261
3302
+ 399172782_surprise_49604
3303
+ 399172782_happy_54795
3304
+ 399172782_sad_58128
3305
+ 399172782_surprise_20700
3306
+ 399172782_sad_57085
3307
+ 399172782_angry_49554
3308
+ 399172782_happy_55984
3309
+ 399172782_neutral_72596
3310
+ 399172782_surprise_69452
3311
+ 399172782_neutral_54493
3312
+ 399172782_angry_73534
3313
+ 399172782_angry_35202
3314
+ 399172782_angry_69823
3315
+ 399172782_sad_69738
3316
+ 399172782_happy_58136
3317
+ 399172782_angry_57797
3318
+ 399172782_neutral_19623
3319
+ 399172782_neutral_20687
3320
+ 399172782_happy_70406
3321
+ 399172782_sad_55955
3322
+ 399172782_surprise_49485
3323
+ 399172782_sad_45117
3324
+ 399172782_neutral_54738
3325
+ 399172782_sad_61948
3326
+ 399172782_neutral_43709
3327
+ 399172782_fear_20788
3328
+ 399172782_fear_62229
3329
+ 399172782_happy_45297
3330
+ 399172782_neutral_70567
3331
+ 399172782_angry_20717
3332
+ 399172782_neutral_54562
3333
+ 399172782_surprise_57301
3334
+ 399172782_happy_62596
3335
+ 399172782_angry_65222
3336
+ 399172782_neutral_58183
3337
+ 399172782_happy_65511
3338
+ 399172782_fear_58200
3339
+ 399172782_surprise_72340
3340
+ 399172782_angry_70450
3341
+ 399172782_neutral_64284
3342
+ 399172782_neutral_55276
3343
+ 399172782_sad_65483
3344
+ 399172782_surprise_45995
3345
+ 399172782_happy_26805
3346
+ 399172782_angry_54767
3347
+ 399172782_happy_57841
3348
+ 399172782_happy_65471
3349
+ 399172782_surprise_26205
3350
+ 399172782_fear_23989
3351
+ 399172782_fear_64125
3352
+ 399172782_surprise_64387
3353
+ 399172782_angry_25699
3354
+ 399172782_sad_56032
3355
+ 399172782_fear_72685
3356
+ 399172782_fear_56136
3357
+ 399172782_angry_23745
3358
+ 399172782_sad_19672
3359
+ 399172782_neutral_43702
3360
+ 399172782_happy_56984
3361
+ 399172782_sad_64441
3362
+ 399172782_neutral_57984
3363
+ 399172782_angry_72647
3364
+ 399172782_happy_69716
3365
+ 399172782_happy_61798
3366
+ 399172782_neutral_21611
3367
+ 399172782_fear_64444
3368
+ 399172782_sad_73707
3369
+ 399172782_surprise_56501
3370
+ 399172782_sad_70316
3371
+ 399172782_neutral_54814
3372
+ 399172782_angry_57019
3373
+ 399172782_sad_49516
3374
+ 399172782_neutral_57730
3375
+ 399172782_surprise_70545
3376
+ 399172782_angry_70337
3377
+ 399172782_neutral_24117
3378
+ 399172782_surprise_64492
3379
+ 399172782_surprise_62687
3380
+ 399172782_surprise_25680
3381
+ 399172782_neutral_23862
3382
+ 399172782_sad_23730
3383
+ 399172782_angry_58262
3384
+ 399172782_fear_36136
3385
+ 399172782_neutral_57827
3386
+ 399172782_surprise_70106
3387
+ 399172782_sad_36132
3388
+ 399172782_fear_48850
3389
+ 399172782_sad_49523
3390
+ 399172782_neutral_26841
3391
+ 399172782_sad_25325
3392
+ 399172782_angry_19781
3393
+ 399172782_surprise_64191
3394
+ 399172782_angry_19701
3395
+ 399172782_neutral_49496
3396
+ 399172782_surprise_65506
3397
+ 399172782_sad_19749
3398
+ 399172782_fear_62114
3399
+ 399172782_surprise_54692
3400
+ 399172782_angry_22164
3401
+ 399172782_neutral_19825
3402
+ 399172782_surprise_62009
3403
+ 399172782_fear_64619
3404
+ 399172782_surprise_26867
3405
+ 399172782_fear_57983
3406
+ 399172782_sad_73823
3407
+ 399172782_fear_61942
3408
+ 399172782_neutral_25697
3409
+ 399172782_happy_57743
3410
+ 399172782_angry_72544
3411
+ 399172782_surprise_20674
3412
+ 399172782_neutral_54770
3413
+ 399172782_angry_21543
3414
+ 399172782_neutral_23794
3415
+ 399172782_fear_56067
3416
+ 399172782_surprise_55373
3417
+ 399172782_fear_70120
3418
+ 399172782_surprise_57773
3419
+ 399172782_fear_58091
3420
+ 399172782_angry_65282
3421
+ 399172782_sad_57150
3422
+ 399172782_angry_25047
3423
+ 399172782_fear_69676
3424
+ 399172782_surprise_57687
3425
+ 399172782_happy_19679
3426
+ 399172782_happy_45073
3427
+ 399172782_neutral_26758
3428
+ 399172782_surprise_56147
3429
+ 399172782_angry_70544
3430
+ 399172782_angry_19757
3431
+ 399172782_sad_62200
3432
+ 399172782_sad_64687
3433
+ 399172782_happy_57582
3434
+ 399172782_neutral_73465
3435
+ 399172782_sad_26216
3436
+ 399172782_happy_65332
3437
+ 399172782_happy_20590
3438
+ 399172782_neutral_23699
3439
+ 399172782_happy_54656
3440
+ 399172782_sad_62267
3441
+ 399172782_sad_19808
3442
+ 399172782_fear_65481
3443
+ 399172782_happy_65541
3444
+ 399172782_angry_69551
3445
+ 399172782_surprise_69638
3446
+ 399172782_sad_70529
3447
+ 399172782_angry_60840
3448
+ 399172782_happy_64415
3449
+ 399172782_happy_49477
3450
+ 399172782_surprise_61858
3451
+ 399172782_happy_60845
3452
+ 399172782_surprise_20809
3453
+ 399172782_angry_19756
3454
+ 399172782_fear_73716
3455
+ 399172782_sad_72252
3456
+ 399172782_angry_70250
3457
+ 399172782_surprise_23749
3458
+ 399172782_sad_73591
3459
+ 399172782_fear_64128
3460
+ 399172782_angry_55360
3461
+ 399172782_sad_55378
3462
+ 399172782_sad_45279
3463
+ 399172782_angry_68043
3464
+ 399172782_surprise_20671
3465
+ 399172782_sad_14234
3466
+ 399172782_fear_56291
3467
+ 399172782_surprise_70334
3468
+ 399172782_happy_24003
3469
+ 399172782_sad_35213
3470
+ 399172782_surprise_54533
3471
+ 399172782_happy_68115
3472
+ 399172782_angry_69890
3473
+ 399172782_neutral_69332
3474
+ 399172782_angry_61832
3475
+ 399172782_sad_26752
3476
+ 399172782_fear_26731
3477
+ 399172782_sad_54667
3478
+ 399172782_sad_64313
3479
+ 399172782_sad_54448
3480
+ 399172782_angry_25358
3481
+ 399172782_sad_56109
3482
+ 399172782_neutral_62536
3483
+ 399172782_surprise_61971
3484
+ 399172782_neutral_72282
3485
+ 399172782_sad_49469
3486
+ 399172782_happy_65243
3487
+ 399172782_sad_14264
3488
+ 399172782_neutral_54621
3489
+ 399172782_surprise_19836
3490
+ 399172782_fear_62150
3491
+ 399172782_surprise_54616
3492
+ 399172782_surprise_48863
3493
+ 399172782_angry_56047
3494
+ 399172782_neutral_64420
3495
+ 399172782_surprise_26744
3496
+ 399172782_neutral_21549
3497
+ 399172782_happy_60853
3498
+ 399172782_angry_45156
3499
+ 399172782_fear_62658
3500
+ 399172782_happy_70574
3501
+ 399172782_surprise_56450
3502
+ 399172782_happy_69540
3503
+ 399172782_surprise_61946
3504
+ 399172782_fear_73547
3505
+ 399172782_happy_62465
3506
+ 399172782_neutral_70362
3507
+ 399172782_fear_70611
3508
+ 399172782_sad_57871
3509
+ 399172782_neutral_62432
3510
+ 399172782_angry_70569
3511
+ 399172782_fear_20799
3512
+ 399172782_happy_49600
3513
+ 399172782_neutral_23784
3514
+ 399172782_neutral_25689
3515
+ 399172782_angry_56197
3516
+ 399172782_sad_72379
3517
+ 399172782_neutral_56364
3518
+ 399172782_angry_14251
3519
+ 399172782_happy_55490
3520
+ 399172782_sad_70257
3521
+ 399172782_fear_56282
3522
+ 399172782_neutral_25062
3523
+ 399172782_fear_73679
3524
+ 399172782_fear_58105
3525
+ 399172782_happy_62431
3526
+ 399172782_happy_65329
3527
+ 399172782_happy_54573
3528
+ 399172782_angry_58223
3529
+ 399172782_fear_56074
3530
+ 399172782_neutral_72468
3531
+ 399172782_fear_65432
3532
+ 399172782_happy_57903
3533
+ 399172782_fear_64632
3534
+ 399172782_happy_19762
3535
+ 399172782_happy_45098
3536
+ 399172782_sad_65542
3537
+ 399172782_sad_56286
3538
+ 399172782_surprise_72346
3539
+ 399172782_angry_23788
3540
+ 399172782_angry_45490
3541
+ 399172782_sad_58019
3542
+ 399172782_neutral_45249
3543
+ 399172782_happy_64342
3544
+ 399172782_surprise_72519
3545
+ 399172782_surprise_21576
3546
+ 399172782_happy_54648
3547
+ 399172782_fear_45092
3548
+ 399172782_angry_45210
3549
+ 399172782_angry_62195
3550
+ 399172782_sad_25350
3551
+ 399172782_neutral_68298
3552
+ 399172782_angry_68016
3553
+ 399172782_fear_45341
3554
+ 399172782_sad_20519
3555
+ 399172782_surprise_60852
3556
+ 399172782_sad_57219
3557
+ 399172782_surprise_45128
3558
+ 399172782_surprise_62509
3559
+ 399172782_happy_64396
3560
+ 399172782_happy_68160
3561
+ 399172782_fear_72320
3562
+ 399172782_neutral_35203
3563
+ 399172782_neutral_43719
3564
+ 399172782_neutral_65545
3565
+ 399172782_happy_62254
3566
+ 399172782_happy_23816
3567
+ 399172782_neutral_54657
3568
+ 399172782_fear_56049
3569
+ 399172782_fear_25028
3570
+ 399172782_sad_64246
3571
+ 399172782_angry_26682
3572
+ 399172782_angry_69563
3573
+ 399172782_happy_20800
3574
+ 399172782_sad_70057
3575
+ 399172782_surprise_19726
3576
+ 399172782_happy_58143
3577
+ 399172782_sad_56205
3578
+ 399172782_sad_57024
3579
+ 399172782_neutral_69857
3580
+ 399172782_fear_56108
3581
+ 399172782_happy_20555
3582
+ 399172782_fear_20874
3583
+ 399172782_surprise_58203
3584
+ 399172782_fear_73643
3585
+ 399172782_sad_57919
3586
+ 399172782_angry_70563
3587
+ 399172782_happy_70329
3588
+ 399172782_angry_64162
3589
+ 399172782_fear_70385
3590
+ 399172782_neutral_61856
3591
+ 399172782_surprise_65458
3592
+ 399172782_angry_54709
3593
+ 399172782_angry_69462
3594
+ 399172782_sad_73548
3595
+ 399172782_angry_64540
3596
+ 399172782_happy_45309
3597
+ 399172782_neutral_68215
3598
+ 399172782_angry_19753
3599
+ 399172782_fear_73472
3600
+ 399172782_angry_57962
3601
+ 399172782_angry_55458
3602
+ 399172782_neutral_70528
3603
+ 399172782_angry_26692
3604
+ 399172782_sad_45348
3605
+ 399172782_surprise_72308
3606
+ 399172782_surprise_58150
3607
+ 399172782_sad_64616
3608
+ 399172782_fear_25035
3609
+ 399172782_fear_61905
3610
+ 399172782_surprise_45324
3611
+ 399172782_surprise_57293
3612
+ 399172782_angry_58022
3613
+ 399172782_neutral_73785
3614
+ 399172782_sad_57152
3615
+ 399172782_happy_25023
3616
+ 399172782_neutral_64157
3617
+ 399172782_neutral_73526
3618
+ 399172782_fear_25285
3619
+ 399172782_fear_70277
3620
+ 399172782_happy_69573
3621
+ 399172782_happy_54471
3622
+ 399172782_angry_55032
3623
+ 399172782_angry_23731
3624
+ 399172782_happy_70515
3625
+ 399172782_fear_22110
3626
+ 399172782_happy_55181
3627
+ 399172782_fear_58175
3628
+ 399172782_neutral_43668
3629
+ 399172782_happy_45470
3630
+ 399172782_fear_69814
3631
+ 399172782_neutral_43690
3632
+ 399172782_fear_26789
3633
+ 399172782_fear_56457
3634
+ 399172782_neutral_25294
3635
+ 399172782_happy_49468
3636
+ 399172782_surprise_68075
3637
+ 399172782_surprise_61930
3638
+ 399172782_surprise_26735
3639
+ 399172782_angry_57071
3640
+ 399172782_surprise_62039
3641
+ 399172782_neutral_21633
3642
+ 399172782_sad_58147
3643
+ 399172782_sad_21510
3644
+ 399172782_fear_72348
3645
+ 399172782_angry_62032
3646
+ 399172782_sad_62138
3647
+ 399172782_surprise_70193
3648
+ 399172782_neutral_25163
3649
+ 399172782_surprise_25194
3650
+ 399172782_happy_55314
3651
+ 399172782_fear_65318
3652
+ 399172782_angry_70615
3653
+ 399172782_neutral_73586
3654
+ 399172782_surprise_19791
3655
+ 399172782_neutral_45528
3656
+ 399172782_fear_45099
3657
+ 399172782_angry_60851
3658
+ 399172782_angry_62474
3659
+ 399172782_happy_62258
3660
+ 399172782_neutral_62214
3661
+ 399172782_angry_64608
3662
+ 399172782_fear_54558
3663
+ 399172782_fear_69839
3664
+ 399172782_happy_69915
3665
+ 399172782_happy_69804
3666
+ 399172782_angry_69507
3667
+ 399172782_fear_61825
3668
+ 399172782_neutral_56175
3669
+ 399172782_fear_68100
3670
+ 399172782_surprise_55510
3671
+ 399172782_happy_25459
3672
+ 399172782_happy_14235
3673
+ 399172782_fear_61833
3674
+ 399172782_neutral_62464
3675
+ 399172782_sad_64354
3676
+ 399172782_happy_46007
3677
+ 399172782_angry_73710
3678
+ 399172782_neutral_73605
3679
+ 399172782_fear_19759
3680
+ 399172782_angry_19761
3681
+ 399172782_fear_61923
3682
+ 399172782_fear_69677
3683
+ 399172782_fear_20812
3684
+ 399172782_angry_43732
3685
+ 399172782_sad_65451
3686
+ 399172782_surprise_64546
3687
+ 399172782_happy_57145
3688
+ 399172782_fear_26727
3689
+ 399172782_angry_45545
3690
+ 399172782_surprise_61852
3691
+ 399172782_surprise_56123
3692
+ 399172782_surprise_26742
3693
+ 399172782_surprise_69557
3694
+ 399172782_neutral_62411
3695
+ 399172782_neutral_20567
3696
+ 399172782_neutral_56266
3697
+ 399172782_happy_43712
3698
+ 399172782_sad_70463
3699
+ 399172782_neutral_70050
3700
+ 399172782_neutral_48922
3701
+ 399172782_neutral_54561
3702
+ 399172782_neutral_64273
3703
+ 399172782_sad_45256
3704
+ 399172782_neutral_21535
3705
+ 399172782_neutral_20834
3706
+ 399172782_neutral_72611
3707
+ 399172782_happy_46022
3708
+ 399172782_fear_22058
3709
+ 399172782_happy_26768
3710
+ 399172782_angry_60864
3711
+ 399172782_neutral_60901
3712
+ 399172782_angry_69363
3713
+ 399172782_neutral_57108
3714
+ 399172782_sad_26770
3715
+ 399172782_neutral_69867
3716
+ 399172782_fear_61801
3717
+ 399172782_happy_62557
3718
+ 399172782_sad_64168
3719
+ 399172782_angry_57380
3720
+ 399172782_angry_60874
3721
+ 399172782_fear_56488
3722
+ 399172782_happy_58011
3723
+ 399172782_angry_18425
3724
+ 399172782_neutral_68188
3725
+ 399172782_sad_45267
3726
+ 399172782_sad_64506
3727
+ 399172782_happy_72711
3728
+ 399172782_happy_69516
3729
+ 399172782_fear_62293
3730
+ 399172782_fear_69979
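Note on the filelists above: every utterance ID follows a <speaker>_<emotion>_<index> pattern (e.g. 399172782_fear_69979). A throwaway sketch of how such an ID can be split back into its parts; the helper below is illustrative and not part of this repo:

# Hypothetical helper: split "<speaker>_<emotion>_<index>" into fields.
def parse_utt_id(utt_id: str) -> dict:
    speaker, emotion, index = utt_id.split("_")
    return {"speaker": speaker, "emotion": emotion, "index": int(index)}

print(parse_utt_id("399172782_fear_69979"))
# {'speaker': '399172782', 'emotion': 'fear', 'index': 69979}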
filelists/all_spks/feats.ark ADDED
Binary file (449 kB). View file
 
filelists/all_spks/feats.scp ADDED
@@ -0,0 +1,3 @@
1
+ utt1 /Users/Desktop/code/GradTTS-emo/filelists/example/feats.ark:5
2
+ utt2 /Users/Desktop/code/GradTTS-emo/filelists/example/feats.ark:78745
3
+ utt3 /Users/Desktop/code/GradTTS-emo/filelists/example/feats.ark:370605
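Each feats.scp entry is a Kaldi-style pointer of the form <utt-id> <path to feats.ark>:<byte offset>. Since the repo already depends on kaldiio (inference_EMA.py below imports WriteHelper), the stored arrays can be read back roughly as follows; the path and the meaning of the features are assumptions here, not something stated in this commit:

# Sketch: iterate over the features referenced by a Kaldi .scp file with kaldiio.
from kaldiio import ReadHelper

with ReadHelper("scp:filelists/all_spks/feats.scp") as reader:
    for utt_id, feats in reader:   # feats comes back as a NumPy array
        print(utt_id, feats.shape)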
filelists/all_spks/text ADDED
The diff for this file is too large to render. See raw diff
 
filelists/all_spks/train_utts.txt ADDED
The diff for this file is too large to render. See raw diff
 
filelists/all_spks/utt2emo.json ADDED
The diff for this file is too large to render. See raw diff
 
filelists/all_spks/utt2spk.json ADDED
The diff for this file is too large to render. See raw diff
 
filelists/inference_generated.txt ADDED
@@ -0,0 +1,2 @@
1
+ Августың аяқ жағына мүсінші тәңірия Венераның баласы Амур бейнесін орналастырған.|0|0
2
+ Қарғыс айтқалы жатыр ғой, өз балаларына!– десіп үркіп үн салды.|1|1
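The two lines above use the <text>|<emotion id>|<speaker id> layout that inference_EMA.py (added below) parses; the integer ids index into its sorted emotion list and its speaker list. A minimal parse of one such line, with a placeholder English sentence standing in for the Kazakh ones above:

# Sketch of the "<text>|<emotion id>|<speaker id>" line format used for inference.
line = "A placeholder sentence to synthesize.|0|1"
text, emo_id, spk_id = line.split("|")
emo_id, spk_id = int(emo_id), int(spk_id)
print(text, emo_id, spk_id)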
g_01720000 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa0a76a49573909b968708717138849d68d9627cdf03307d00c7bd49278dc573
3
+ size 55824433
grad_uncond.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:228736e3807fcd84030ec03daf1124094e6388e7a08ccabd87354d93a2e7fe0c
3
+ size 69784515
grad_uncond_10k_conf.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbc5ffac171e269fd08bb0313022cb2c3c35e9a9cc4620ef4992fd79cf6e61a8
3
+ size 69787322
grad_uncond_cnn_001.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bc2ba56e0be17fa5d95c55b1c9bedfcd45a45edf865d1cb3f30a1405e645f67
3
+ size 69787003
inference_EMA.py ADDED
@@ -0,0 +1,89 @@
1
+ import argparse
2
+ import json
3
+ import datetime as dt
4
+ import numpy as np
5
+ from scipy.io.wavfile import write
6
+ import IPython.display as ipd
7
+ import glob
8
+ import torch
9
+ from pydub import AudioSegment
10
+ from torch.utils.data import DataLoader
11
+ from text import text_to_sequence, cmudict
12
+ from text.symbols import symbols
13
+ import utils_data
14
+ import re
15
+ from num2words import num2words
16
+ from kaldiio import WriteHelper
17
+ import os
18
+ from tqdm import tqdm
19
+ from text import text_to_sequence, convert_text
20
+ from model import GradTTSWithEmo
21
+ import utils_data as utils
22
+ from attrdict import AttrDict
23
+ from models import Generator as HiFiGAN
24
+
25
+
26
+ HIFIGAN_CONFIG = './configs/hifigan-config.json'
27
+ HIFIGAN_CHECKPT = './checkpts/hifigan.pt'
28
+
29
+
30
+ if __name__ == '__main__':
31
+ hps, args = utils.get_hparams_decode()
32
+ device = torch.device('cpu' if not torch.cuda.is_available() else "cuda")
33
+ ckpt = utils_data.latest_checkpoint_path(hps.model_dir, "EMA_grad_*.pt")
34
+ print(ckpt)
35
+ model = GradTTSWithEmo(**hps.model).to(device)
36
+ logger = utils_data.get_logger(hps.model_dir, "inference.log")
37
+ utils_data.load_checkpoint(ckpt, model, None)
38
+ _ = model.cuda().eval()
39
+
40
+ print('Initializing HiFi-GAN...')
41
+ with open(HIFIGAN_CONFIG) as f:
42
+ h = AttrDict(json.load(f))
43
+ vocoder = HiFiGAN(h)
44
+ vocoder.load_state_dict(torch.load(HIFIGAN_CHECKPT, map_location=lambda storage, loc: storage)['generator'])
45
+ _ = vocoder.cuda().eval()
46
+ vocoder.remove_weight_norm()
47
+
48
+ emos = sorted(["angry", "surprise", "fear", "happy", "neutral", "sad"])
49
+ speakers = ['M1', 'F1', 'M2']
50
+
51
+ with open(args.file, 'r', encoding='utf-8') as f:
52
+ texts = [line.strip() for line in f.readlines()]
53
+
54
+ replace_nums = []
55
+ for i in texts:
56
+ replace_nums.append(i.split('|', 1))
57
+
58
+ nums2word = [re.sub(r'(\d+)', lambda m: num2words(m.group(), lang='kz'), sentence) for sentence in np.array(replace_nums)[:, 0]]
59
+ # Speakers id.
60
+ # M1 = 0
61
+ # F1 = 1
62
+ # M2 = 2
63
+ text2speech = []
64
+ for i, j in zip(nums2word, np.array(replace_nums)[:, 1]):
65
+ text2speech.append(f'{i}|{j}')
66
+
67
+ for i, line in enumerate(text2speech):
68
+ emo_i = int(line.split('|')[1])
69
+ control_spk_id = int(line.split('|')[2])
70
+ control_emo_id = emos.index(emos[emo_i])
71
+ text = line.split('|')[0]
72
+ with torch.no_grad():
73
+ ### define emotion
74
+ emo = torch.LongTensor([control_emo_id]).to(device)
75
+ sid = torch.LongTensor([control_spk_id]).to(device)
76
+ text_padded, text_len = convert_text(text)
77
+ y_enc, y_dec, attn = model.forward(text_padded, text_len,
78
+ n_timesteps=args.timesteps,
79
+ temperature=args.noise,
80
+ stoc=args.stoc, spk=sid, emo=emo, length_scale=1.,
81
+ classifier_free_guidance=args.guidance)
82
+ res = y_dec.squeeze().cpu().numpy()
83
+ x = torch.from_numpy(res).cuda().unsqueeze(0)
84
+ y_g_hat = vocoder(x)
85
+ audio = y_g_hat.squeeze()
86
+ audio = audio * 32768.0
87
+ audio = audio.detach().cpu().numpy().astype('int16')
88
+ audio = AudioSegment(audio.data, frame_rate=22050, sample_width=2, channels=1)
89
+ audio.export(f'{args.generated_path}/{emos[emo_i]}_{speakers[int(line.split("|")[2])]}.wav', format="wav")
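One preprocessing step in the script above is easy to miss: digit groups in the input text are expanded into Kazakh number words before synthesis. A standalone sketch of just that step (the sample sentence is made up, and the exact spelling depends on the installed num2words version):

# Digit-expansion step in isolation; requires the num2words package.
import re
from num2words import num2words

sentence = "Sample text with the number 25 in it."
normalized = re.sub(r"(\d+)", lambda m: num2words(m.group(), lang="kz"), sentence)
print(normalized)  # the digit group "25" is replaced by its Kazakh spelling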
inference_intensity_control.ipynb ADDED
File without changes
melspec.py ADDED
@@ -0,0 +1,40 @@
1
+ import torch
2
+ import torchaudio
3
+ import librosa
4
+
5
+ mel_basis = {}
6
+ hann_window = {}
7
+
8
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
9
+ return torch.log(torch.clamp(x, min=clip_val) * C)
10
+
11
+ def spectral_normalize_torch(magnitudes):
12
+ output = dynamic_range_compression_torch(magnitudes)
13
+ return output
14
+
15
+
16
+
17
+ def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
18
+ if torch.min(y) < -1.:
19
+ print('min value is ', torch.min(y))
20
+ if torch.max(y) > 1.:
21
+ print('max value is ', torch.max(y))
22
+
23
+ global mel_basis, hann_window
24
+ if fmax not in mel_basis:
25
+ mel = librosa.filters.mel(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
26
+ mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device)
27
+ hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)
28
+
29
+ y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
30
+ y = y.squeeze(1)
31
+
32
+ spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)],
33
+ center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
34
+
35
+ spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9))
36
+
37
+ spec = torch.matmul(mel_basis[str(fmax)+'_'+str(y.device)], spec)
38
+ spec = spectral_normalize_torch(spec)
39
+
40
+ return spec.numpy()
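A quick way to smoke-test mel_spectrogram is to feed it a dummy waveform; the STFT/mel settings below are common 22.05 kHz HiFi-GAN values and are placeholders, not the values from this repo's configs:

# Illustrative call only; parameter values are placeholders, not the project's configs.
import torch
from melspec import mel_spectrogram

y = torch.rand(1, 22050) * 2 - 1   # one second of fake audio in [-1, 1), shape [B, T]
mel = mel_spectrogram(y, n_fft=1024, num_mels=80, sampling_rate=22050,
                      hop_size=256, win_size=1024, fmin=0, fmax=8000)
print(mel.shape)                   # (1, 80, n_frames)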
model/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+
2
+ from .tts import GradTTSWithEmo, GradTTSXvector
model/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (240 Bytes). View file
 
model/__pycache__/tts.cpython-39.pyc ADDED
Binary file (16.7 kB). View file
 
model/base.py ADDED
@@ -0,0 +1,28 @@
1
+ import numpy as np
2
+ import torch
3
+
4
+
5
+ class BaseModule(torch.nn.Module):
6
+ def __init__(self):
7
+ super(BaseModule, self).__init__()
8
+
9
+ @property
10
+ def nparams(self):
11
+ """
12
+ Returns number of trainable parameters of the module.
13
+ """
14
+ num_params = 0
15
+ for name, param in self.named_parameters():
16
+ if param.requires_grad:
17
+ num_params += np.prod(param.detach().cpu().numpy().shape)
18
+ return num_params
19
+
20
+ def relocate_input(self, x: list):
21
+ """
22
+ Relocates provided tensors to the same device set for the module.
23
+ """
24
+ device = next(self.parameters()).device
25
+ for i in range(len(x)):
26
+ if isinstance(x[i], torch.Tensor) and x[i].device != device:
27
+ x[i] = x[i].to(device)
28
+ return x
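nparams above simply sums the shapes of all trainable parameters; a toy check (the Linear layer is only an example, not one of this repo's models):

# Toy subclass to sanity-check BaseModule.nparams.
import torch
from model.base import BaseModule

class Toy(BaseModule):
    def __init__(self):
        super().__init__()
        self.proj = torch.nn.Linear(10, 4)   # 40 weights + 4 biases

print(Toy().nparams)  # 44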
model/classifier.py ADDED
@@ -0,0 +1,690 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch import Tensor, BoolTensor
4
+
5
+ from typing import Optional, Tuple, Iterable
6
+ from model.diffusion import SinusoidalPosEmb
7
+ from torch.nn.functional import pad
8
+
9
+
10
+ import math
11
+
12
+ def silu(input):
13
+ '''
14
+ Applies the Sigmoid Linear Unit (SiLU) function element-wise:
15
+ SiLU(x) = x * sigmoid(x)
16
+ '''
17
+ return input * torch.sigmoid(input) # torch.sigmoid gives the most efficient built-in PyTorch implementation
18
+
19
+
20
+ class RelPositionMultiHeadedAttention(nn.Module):
21
+ """Multi-Head Self-Attention layer with relative position encoding.
22
+ Paper: https://arxiv.org/abs/1901.02860
23
+ Args:
24
+ n_head: The number of heads.
25
+ d: The number of features.
26
+ dropout: Dropout rate.
27
+ zero_triu: Whether to zero the upper triangular part of attention matrix.
28
+ """
29
+
30
+ def __init__(
31
+ self, d: int, n_head: int, dropout: float
32
+ ):
33
+ super().__init__()
34
+ assert d % n_head == 0
35
+ self.c = d // n_head
36
+ self.h = n_head
37
+
38
+ self.linear_q = nn.Linear(d, d)
39
+ self.linear_k = nn.Linear(d, d)
40
+ self.linear_v = nn.Linear(d, d)
41
+ self.linear_out = nn.Linear(d, d)
42
+
43
+ self.p_attn = None
44
+ self.dropout = nn.Dropout(p=dropout)
45
+
46
+ # linear transformation for positional encoding
47
+ self.linear_pos = nn.Linear(d, d, bias=False)
48
+
49
+ # these two learnable bias are used in matrix c and matrix d
50
+ # as described in https://arxiv.org/abs/1901.02860 Section 3.3
51
+ self.u = nn.Parameter(torch.Tensor(self.h, self.c))
52
+ self.v = nn.Parameter(torch.Tensor(self.h, self.c))
53
+ # [H, C]
54
+ torch.nn.init.xavier_uniform_(self.u)
55
+ torch.nn.init.xavier_uniform_(self.v)
56
+
57
+ def forward_qkv(self, query, key, value) -> Tuple[Tensor, ...]:
58
+ """Transform query, key and value.
59
+ Args:
60
+ query (Tensor): [B, S, D].
61
+ key (Tensor): [B, T, D].
62
+ value (Tensor): [B, T, D].
63
+ Returns:
64
+ q (Tensor): [B, H, S, C].
65
+ k (Tensor): [B, H, T, C].
66
+ v (Tensor): [B, H, T, C].
67
+ """
68
+ n_batch = query.size(0)
69
+ q = self.linear_q(query).view(n_batch, -1, self.h, self.c)
70
+ k = self.linear_k(key).view(n_batch, -1, self.h, self.c)
71
+ v = self.linear_v(value).view(n_batch, -1, self.h, self.c)
72
+ q = q.transpose(1, 2)
73
+ k = k.transpose(1, 2)
74
+ v = v.transpose(1, 2)
75
+ return q, k, v
76
+
77
+ def forward_attention(self, v, scores, mask, causal=False) -> Tensor:
78
+ """Compute attention context vector.
79
+ Args:
80
+ v (Tensor): [B, H, T, C].
81
+ scores (Tensor): [B, H, S, T].
82
+ mask (BoolTensor): [B, T], True values are masked from scores.
83
+ Returns:
84
+ result (Tensor): [B, S, D]. Attention result weighted by the score.
85
+ """
86
+ n_batch, H, S, T = scores.shape
87
+ if mask is not None:
88
+ scores = scores.masked_fill(
89
+ mask.unsqueeze(1).unsqueeze(2).to(bool),
90
+ float("-inf"), # [B, H, S, T]
91
+ )
92
+ if causal:
93
+ k_grid = torch.arange(0, S, dtype=torch.int32, device=scores.device)
94
+ v_grid = torch.arange(0, T, dtype=torch.int32, device=scores.device)
95
+ kk, vv = torch.meshgrid(k_grid, v_grid, indexing="ij")
96
+ causal_mask = vv > kk
97
+ scores = scores.masked_fill(
98
+ causal_mask.view(1, 1, S, T), float("-inf")
99
+ )
100
+
101
+ p_attn = self.p_attn = torch.softmax(scores, dim=-1) # [B, H, S, T]
102
+ p_attn = self.dropout(p_attn) # [B, H, S, T]
103
+
104
+ x = torch.matmul(p_attn, v) # [B, H, S, C]
105
+ x = (
106
+ x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.c)
107
+ ) # [B, S, D]
108
+
109
+ return self.linear_out(x) # [B, S, D]
110
+
111
+ def rel_shift(self, x):
112
+ """Converting (..., i, i - j) matrix into (..., i, j) matrix.
113
+ Args:
114
+ x (Tensor): [B, H, S, 2S-1].
115
+ Returns:
116
+ x (Tensor): [B, H, S, S].
117
+ Example: Take S = 2 for example, larger values work similarly.
118
+ x = [
119
+ [(0, -1), (0, 0), (0, 1)],
120
+ [(1, 0), (1, 1), (1, 2)]
121
+ ]
122
+ x_padded = [
123
+ [(x, x), (0, -1), (0, 0), (0, 1)],
124
+ [(x, x), (1, 0), (1, 1), (1, 2)]
125
+ ]
126
+ x_padded = [
127
+ [(x, x), (0, -1)],
128
+ [(0, 0), (0, 1)],
129
+ [(x, x), (1, 0)],
130
+ [(1, 1), (1, 2)]
131
+ ]
132
+ x = [
133
+ [(0, 0), (0, 1)],
134
+ [(1, 0), (1, 1)]
135
+ ]
136
+ """
137
+ B, H, S, _ = x.shape
138
+ zero_pad = torch.zeros((B, H, S, 1), device=x.device, dtype=x.dtype)
139
+ # [B, H, S, 1]
140
+ x_padded = torch.cat([zero_pad, x], dim=-1)
141
+ # [B, H, S, 2S]
142
+ x_padded = x_padded.view(B, H, 2 * S, S)
143
+ # [B, H, 2S, S]
144
+ x = x_padded[:, :, 1:].view_as(x)[:, :, :, :S]
145
+ # only keep the positions from 0 to S
146
+ # [B, H, 2S-1, S] <view> [B, H, S, 2S - 1] <truncate in dim -1> [B, H, S, S]
147
+ return x
148
+
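For anyone verifying the index gymnastics in rel_shift, here is a throwaway numeric trace of the same tensor operations for S = 2 (the values are arbitrary):

# Standalone check of the shifting trick for S = 2, B = H = 1.
import torch

x = torch.tensor([[[[1., 2., 3.],
                    [4., 5., 6.]]]])                    # [B, H, S, 2S-1]
B, H, S, _ = x.shape
x_padded = torch.cat([torch.zeros(B, H, S, 1), x], dim=-1).view(B, H, 2 * S, S)
out = x_padded[:, :, 1:].view_as(x)[:, :, :, :S]
print(out)  # rows [2., 3.] and [4., 5.], wrapped in the B and H dims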
149
+ def forward(
150
+ self, query, key, value, pos_emb, mask=None, causal=False):
151
+ """Compute self-attention with relative positional embedding.
152
+ Args:
153
+ query (Tensor): [B, S, D].
154
+ key (Tensor): [B, S, D].
155
+ value (Tensor): [B, S, D].
156
+ pos_emb (Tensor): [1/B, 2S-1, D]. Positional embedding.
157
+ mask (BoolTensor): [B, S], True for masked.
158
+ causal (bool): True for applying causal mask.
159
+ Returns:
160
+ output (Tensor): [B, S, D].
161
+ """
162
+ # Splitting Q, K, V:
163
+ q, k, v = self.forward_qkv(query, key, value)
164
+ # [B, H, S, C], [B, H, S, C], [B, H, S, C]
165
+
166
+ # Adding per head & channel biases to the query vectors:
167
+ q_u = q + self.u.unsqueeze(1)
168
+ q_v = q + self.v.unsqueeze(1)
169
+ # [B, H, S, C]
170
+
171
+ # Splitting relative positional coding:
172
+ n_batch_pos = pos_emb.size(0) # [1/B, 2S-1, D]
173
+ p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.c)
174
+ # [1/B, 2S-1, H, C]
175
+ p = p.transpose(1, 2) # [1/B, H, 2S-1, C].
176
+
177
+ # Compute query, key similarity:
178
+ matrix_ac = torch.matmul(q_u, k.transpose(-2, -1))
179
+ # [B, H, S, C] x [B, H, C, S] -> [B, H, S, S]
180
+
181
+ matrix_bd = torch.matmul(q_v, p.transpose(-2, -1))
182
+ # [B, H, S, C] x [1/B, H, C, 2S-1] -> [B, H, S, 2S-1]
183
+ matrix_bd = self.rel_shift(matrix_bd)
184
+
185
+ scores = (matrix_ac + matrix_bd) / math.sqrt(self.c)
186
+ # [B, H, S, S]
187
+
188
+ return self.forward_attention(v, scores, mask, causal) # [B, S, D]
189
+
190
+
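Written out, the score computed by the forward pass above for query position i and key position j is, with r_{i-j} the linearly projected relative positional code, u and v the learned biases, and c the per-head dimension:

\mathrm{score}_{i,j} = \frac{(q_i + u)^\top k_j + (q_i + v)^\top r_{i-j}}{\sqrt{c}}

i.e. the (a)+(b)+(c)+(d) decomposition of Transformer-XL (arXiv:1901.02860), where matrix_ac carries the content terms and matrix_bd, after rel_shift, carries the positional terms.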
191
+ class ConditionalBiasScale(nn.Module):
192
+ def __init__(self, channels: int, cond_channels: int):
193
+ super().__init__()
194
+ self.scale_transform = nn.Linear(
195
+ cond_channels, channels, bias=True
196
+ )
197
+ self.bias_transform = nn.Linear(
198
+ cond_channels, channels, bias=True
199
+ )
200
+ self.init_parameters()
201
+
202
+ def init_parameters(self):
203
+ torch.nn.init.constant_(self.scale_transform.weight, 0.0)
204
+ torch.nn.init.constant_(self.scale_transform.bias, 1.0)
205
+ torch.nn.init.constant_(self.bias_transform.weight, 0.0)
206
+ torch.nn.init.constant_(self.bias_transform.bias, 0.0)
207
+
208
+ def forward(self, x: Tensor, cond: Tensor) -> Tensor:
209
+ """Applying conditional bias and scale.
210
+ Args:
211
+ x (Tensor): [..., channels].
212
+ cond (Tensor): [..., cond_channels].
213
+ Returns:
214
+ y (Tensor): [..., channels].
215
+ """
216
+ a = self.scale_transform.forward(cond)
217
+ b = self.bias_transform.forward(cond)
218
+ return x * a + b
219
+
220
+
221
+ class FeedForwardModule(torch.nn.Module):
222
+ """Positionwise feed forward layer used in conformer"""
223
+
224
+ def __init__(
225
+ self, d_in: int, d_hidden: int,
226
+ dropout: float, bias: bool = True, d_cond: int = 0
227
+ ):
228
+ """
229
+ Args:
230
+ d_in (int): Input feature dimension.
231
+ d_hidden (int): Hidden unit dimension.
232
+ dropout (float): dropout value for first Linear Layer.
233
+ bias (bool): If linear layers should have bias.
234
+ d_cond (int, optional): The channels of conditional tensor.
235
+ """
236
+ super(FeedForwardModule, self).__init__()
237
+ self.layer_norm = torch.nn.LayerNorm(d_in)
238
+
239
+ if d_cond > 0:
240
+ self.cond_layer = ConditionalBiasScale(d_in, d_cond)
241
+
242
+ self.w_1 = torch.nn.Linear(d_in, d_hidden, bias=bias)
243
+ self.w_2 = torch.nn.Linear(d_hidden, d_in, bias=bias)
244
+ self.dropout = torch.nn.Dropout(dropout)
245
+
246
+ def forward(self, x: Tensor, cond: Optional[Tensor] = None) -> Tensor:
247
+ """
248
+ Args:
249
+ x (Tensor): [..., D].
250
+ cond (Tensor, optional): [..., D_cond].
251
+ Returns:
252
+ y (Tensor): [..., D].
253
+ """
254
+ x = self.layer_norm(x)
255
+
256
+ if cond is not None:
257
+ x = self.cond_layer.forward(x, cond)
258
+
259
+ x = self.w_1(x)
260
+ x = silu(x)
261
+ x = self.dropout(x)
262
+ x = self.w_2(x)
263
+ return self.dropout(x)
264
+
265
+
266
+ class RelPositionalEncoding(nn.Module):
267
+ """Relative positional encoding cache.
268
+
269
+ Args:
270
+ d_model: Embedding dimension.
271
+ dropout_rate: Dropout rate.
272
+ max_len: Default maximum input length.
273
+ """
274
+
275
+ def __init__(self, max_len: int, d_model: int):
276
+ super().__init__()
277
+ self.d_model = d_model
278
+ self.cached_code = None
279
+ self.l = 0
280
+ self.gen_code(torch.tensor(0.0).expand(1, max_len))
281
+
282
+ def gen_code(self, x: Tensor):
283
+ """Generate positional encoding with a reference tensor x.
284
+ Args:
285
+ x (Tensor): [B, L, ...], we extract the device, length, and dtype from it.
286
+ Effects:
287
+ self.cached_code (Tensor): [1, >=(2L-1), D].
288
+ """
289
+ l = x.size(1)
290
+ if self.l >= l:
291
+ if self.cached_code.dtype != x.dtype or self.cached_code.device != x.device:
292
+ self.cached_code = self.cached_code.to(dtype=x.dtype, device=x.device)
293
+ return
294
+ # Suppose `i` is the position of the query vector and `j` is the
295
+ # position of the key vector. We use positive relative positions when keys
296
+ # are to the left (i>j) and negative relative positions otherwise (i<j).
297
+ code_pos = torch.zeros(l, self.d_model) # [L, D]
298
+ code_neg = torch.zeros(l, self.d_model) # [L, D]
299
+ pos = torch.arange(0, l, dtype=torch.float32).unsqueeze(1) # [L, 1]
300
+ decay = torch.exp(
301
+ torch.arange(0, self.d_model, 2, dtype=torch.float32)
302
+ * -(math.log(10000.0) / self.d_model)
303
+ ) # [D // 2]
304
+ code_pos[:, 0::2] = torch.sin(pos * decay)
305
+ code_pos[:, 1::2] = torch.cos(pos * decay)
306
+ code_neg[:, 0::2] = torch.sin(-1 * pos * decay)
307
+ code_neg[:, 1::2] = torch.cos(-1 * pos * decay)
308
+
309
+ # Reverse the order of positive indices and concat both positive and
310
+ # negative indices. This is used to support the shifting trick
311
+ # as in https://arxiv.org/abs/1901.02860
312
+ code_pos = torch.flip(code_pos, [0]).unsqueeze(0) # [1, L, D]
313
+ code_neg = code_neg[1:].unsqueeze(0) # [1, L - 1, D]
314
+ code = torch.cat([code_pos, code_neg], dim=1) # [1, 2L - 1, D]
315
+ self.cached_code = code.to(device=x.device, dtype=x.dtype)
316
+ self.l = l
317
+
318
+ def forward(self, x: Tensor) -> Tensor:
319
+ """Get positional encoding of appropriate shape given a reference Tensor.
320
+ Args:
321
+ x (Tensor): [B, L, ...].
322
+ Returns:
323
+ y (Tensor): [1, 2L-1, D].
324
+ """
325
+ self.gen_code(x)
326
+ l = x.size(1)
327
+ pos_emb = self.cached_code[
328
+ :, self.l - l: self.l + l - 1,
329
+ ]
330
+ return pos_emb
331
+
332
+
333
+ class ConformerBlock(torch.nn.Module):
334
+ """Conformer block based on https://arxiv.org/abs/2005.08100."""
335
+
336
+ def __init__(
337
+ self, d: int, d_hidden: int,
338
+ attention_heads: int, dropout: float,
339
+ depthwise_conv_kernel_size: int = 7,
340
+ causal: bool = False, d_cond: int = 0
341
+ ):
342
+ """
343
+ Args:
344
+ d (int): Block input output channel number.
345
+ d_hidden (int): FFN layer dimension.
346
+ attention_heads (int): Number of attention heads.
347
+ dropout (float): dropout value.
348
+ depthwise_conv_kernel_size (int): Size of kernel in depthwise conv.
349
+ d_cond (int, optional): The channels of conditional tensor.
350
+ """
351
+ super(ConformerBlock, self).__init__()
352
+ self.causal = causal
353
+ self.ffn1 = FeedForwardModule(
354
+ d, d_hidden, dropout, bias=True, d_cond=d_cond
355
+ )
356
+
357
+ self.self_attn_layer_norm = torch.nn.LayerNorm(d)
358
+
359
+ if d_cond > 0:
360
+ self.cond_layer = ConditionalBiasScale(d, d_cond)
361
+
362
+ self.self_attn = RelPositionMultiHeadedAttention(
363
+ d, attention_heads, dropout=dropout
364
+ )
365
+ self.self_attn_dropout = torch.nn.Dropout(dropout)
366
+
367
+ self.conv_module = ConvolutionModule(
368
+ d_in=d, d_hidden=d,
369
+ depthwise_kernel_size=depthwise_conv_kernel_size,
370
+ dropout=dropout, d_cond=d_cond
371
+ )
372
+
373
+ self.ffn2 = FeedForwardModule(
374
+ d, d_hidden, dropout, bias=True, d_cond=d_cond
375
+ )
376
+
377
+ self.final_layer_norm = torch.nn.LayerNorm(d)
378
+
379
+ def forward(
380
+ self, x: Tensor, mask: BoolTensor, pos_emb: Tensor,
381
+ cond: Optional[Tensor] = None
382
+ ) -> Tensor:
383
+ """
384
+ Args:
385
+ x (Tensor): [B, T, D_in].
386
+ mask (BoolTensor): [B, T], True for masked.
387
+ pos_emb (Tensor): [1 or B, 2T-1, D].
388
+ cond (Tensor, optional): [B, ?, D_cond].
389
+ Returns:
390
+ y (Tensor): [B, T, D_in].
391
+ """
392
+ y = x
393
+
394
+ x = self.ffn1(x) * 0.5 + y
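+ # Macaron-style half-step residual: each feed-forward module contributes with a
+ # 0.5-weighted residual connection, as in the Conformer paper cited above.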
395
+ y = x
396
+ # [B, T, D_in]
397
+
398
+ x = self.self_attn_layer_norm(x)
399
+
400
+ if cond is not None:
401
+ x = self.cond_layer.forward(x, cond)
402
+
403
+ x = self.self_attn.forward(
404
+ query=x, key=x, value=x,
405
+ pos_emb=pos_emb,
406
+ mask=mask, causal=self.causal
407
+ )
408
+ x = self.self_attn_dropout(x) + y
409
+ y = x
410
+ # [B, T, D_in]
411
+
412
+ x = self.conv_module.forward(x, mask) + y
413
+ y = x
414
+ # [B, T, D_in]
415
+
416
+ x = self.ffn2(x) * 0.5 + y
417
+
418
+ x = self.final_layer_norm(x)
419
+
420
+ x = x.masked_fill(mask.unsqueeze(-1), 0.0) # masked_fill is out-of-place; keep the result
421
+
422
+ return x
423
+
424
+
425
+ class ConvolutionModule(torch.nn.Module):
426
+ """Convolution Block inside a Conformer Block."""
427
+
428
+ def __init__(
429
+ self, d_in: int, d_hidden: int,
430
+ depthwise_kernel_size: int,
431
+ dropout: float, bias: bool = False,
432
+ causal: bool = False, d_cond: int = 0
433
+ ):
434
+ """
435
+ Args:
436
+ d_in (int): Embedding dimension.
437
+ d_hidden (int): Number of channels in depthwise conv layers.
438
+ depthwise_kernel_size (int): Depthwise conv layer kernel size.
439
+ dropout (float): dropout value.
440
+ bias (bool): If bias should be added to conv layers.
441
+ d_cond (int, optional): Channels of the conditioning tensor (0 disables the conditional bias/scale).
442
+ """
443
+ super(ConvolutionModule, self).__init__()
444
+ assert (depthwise_kernel_size - 1) % 2 == 0, "kernel_size should be odd"
445
+ self.causal = causal
446
+ self.causal_padding = (depthwise_kernel_size - 1, 0)
447
+ self.layer_norm = torch.nn.LayerNorm(d_in)
448
+
449
+ # Optional conditional LayerNorm:
450
+ self.d_cond = d_cond
451
+ if d_cond > 0:
452
+ self.cond_layer = ConditionalBiasScale(d_in, d_cond)
453
+
454
+ self.pointwise_conv1 = torch.nn.Conv1d(
455
+ d_in, 2 * d_hidden,
456
+ kernel_size=1,
457
+ stride=1, padding=0,
458
+ bias=bias
459
+ )
460
+ self.glu = torch.nn.GLU(dim=1)
461
+ self.depthwise_conv = torch.nn.Conv1d(
462
+ d_hidden, d_hidden,
463
+ kernel_size=depthwise_kernel_size,
464
+ stride=1,
465
+ padding=(depthwise_kernel_size - 1) // 2 if not causal else 0,
466
+ groups=d_hidden, bias=bias
467
+ )
468
+ self.pointwise_conv2 = torch.nn.Conv1d(
469
+ d_hidden, d_in,
470
+ kernel_size=1,
471
+ stride=1, padding=0,
472
+ bias=bias,
473
+ )
474
+ self.dropout = torch.nn.Dropout(dropout)
475
+
476
+ def forward(self, x: Tensor, mask: BoolTensor, cond: Optional[Tensor] = None) -> Tensor:
477
+ """
478
+ Args:
479
+ x (Tensor): [B, T, D_in].
480
+ mask (BoolTensor): [B, T], True for masked.
481
+ cond (Tensor): [B, T, D_cond].
482
+ Returns:
483
+ y (Tensor): [B, T, D_in].
484
+ """
485
+ x = self.layer_norm(x)
486
+
487
+ if cond is not None:
488
+ x = self.cond_layer.forward(x, cond)
489
+
490
+ x = x.transpose(-1, -2) # [B, D_in, T]
491
+
492
+ x = self.pointwise_conv1(x) # [B, 2C, T]
493
+ x = self.glu(x) # [B, C, T]
494
+
495
+ # Take care of masking the input tensor:
496
+ if mask is not None:
497
+ x = x.masked_fill(mask.unsqueeze(1), 0.0)
498
+
499
+ # 1D Depthwise Conv
500
+ if self.causal: # Causal padding
501
+ x = pad(x, self.causal_padding)
502
+ x = self.depthwise_conv(x)
503
+ # FIXME: BatchNorm should not be used in variable length training.
504
+ x = silu(x) # [B, C, T]
505
+
506
+ if mask is not None:
507
+ x = x.masked_fill(mask.unsqueeze(1), 0.0)
508
+
509
+ x = self.pointwise_conv2(x)
510
+ x = self.dropout(x)
511
+ return x.transpose(-1, -2) # [B, T, D_in]
512
+
513
+
514
+ class Conformer(torch.nn.Module):
515
+ def __init__(
516
+ self,
517
+ d: int,
518
+ d_hidden: int,
519
+ n_heads: int,
520
+ n_layers: int,
521
+ dropout: float,
522
+ depthwise_conv_kernel_size: int,
523
+ causal: bool = False,
524
+ d_cond: int = 0
525
+ ):
526
+ super().__init__()
527
+ self.pos_encoding = RelPositionalEncoding(1024, d)
528
+ self.causal = causal
529
+
530
+ self.blocks = torch.nn.ModuleList(
531
+ [
532
+ ConformerBlock(
533
+ d=d,
534
+ d_hidden=d_hidden,
535
+ attention_heads=n_heads,
536
+ dropout=dropout,
537
+ depthwise_conv_kernel_size=depthwise_conv_kernel_size,
538
+ causal=causal,
539
+ d_cond=d_cond
540
+ )
541
+ for _ in range(n_layers)
542
+ ]
543
+ ) # type: Iterable[ConformerBlock]
544
+
545
+ def forward(
546
+ self, x: Tensor, mask: BoolTensor, cond: Optional[Tensor] = None
547
+ ) -> Tensor:
548
+ """Conformer forwarding.
549
+ Args:
550
+ x (Tensor): [B, T, D].
551
+ mask (BoolTensor): [B, T], with True for masked.
552
+ cond (Tensor, optional): [B, T, D_cond].
553
+ Returns:
554
+ y (Tensor): [B, T, D]
555
+ """
556
+ pos_emb = self.pos_encoding(x) # [1, 2T-1, D]
557
+
558
+ for block in self.blocks:
559
+ x = block.forward(x, mask, pos_emb, cond)
560
+
561
+ return x
562
+
563
+
564
+ class CNNBlock(nn.Module):
565
+ def __init__(self, in_dim, out_dim, dropout, cond_dim, kernel_size, stride):
566
+ super(CNNBlock, self).__init__()
567
+ self.layers = nn.Sequential(
568
+ nn.Conv1d(in_dim, out_dim, kernel_size, stride),
569
+ nn.ReLU(),
570
+ nn.BatchNorm1d(out_dim,),
571
+ nn.Dropout(p=dropout)
572
+ )
573
+
574
+ def forward(self, inp):
575
+ out = self.layers(inp)
576
+ return out
577
+
578
+
579
+ class CNNClassifier(nn.Module):
580
+ def __init__(self, in_dim, d_decoder, decoder_dropout, cond_dim):
581
+ super(CNNClassifier, self).__init__()
582
+ self.cnn = nn.Sequential(
583
+ CNNBlock(in_dim, d_decoder, decoder_dropout, cond_dim, 8, 4),
584
+ CNNBlock(d_decoder, d_decoder, decoder_dropout, cond_dim, 8, 4),
585
+ CNNBlock(d_decoder, d_decoder, decoder_dropout, cond_dim, 4, 2),
586
+ CNNBlock(d_decoder, d_decoder, decoder_dropout, cond_dim, 4, 2),
587
+ ) # receptive field is 180, frame shift is 64
588
+ self.cond_layer = nn.Sequential(
589
+ nn.Linear(cond_dim, in_dim),
590
+ nn.LeakyReLU(),
591
+ nn.Linear(in_dim, in_dim)
592
+ )
593
+
594
+ def forward(self, inp, mask, cond):
595
+ inp = inp.transpose(-1, -2)
596
+ cond = cond.transpose(-1, -2)
597
+ inp.masked_fill_(mask.unsqueeze(1), 0.0)
598
+ cond = self.cond_layer(cond.transpose(-1, -2)).transpose(-1, -2)
599
+ cond.masked_fill_(mask.unsqueeze(1), 0.0)
600
+ inp = inp + cond
601
+ return self.cnn(inp)
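+ # The four conv blocks use strides 4, 4, 2, 2, so the output is downsampled by a
+ # factor of 64 in time, matching the "frame shift is 64" note above.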
602
+
603
+
604
+ class CNNClassifierWithTime(nn.Module):
605
+ def __init__(self, in_dim, d_decoder, decoder_dropout, cond_dim, time_emb_dim=512):
606
+ super(CNNClassifierWithTime, self).__init__()
607
+ self.cnn = nn.Sequential(
608
+ CNNBlock(in_dim, d_decoder, decoder_dropout, cond_dim, 8, 4),
609
+ CNNBlock(d_decoder, d_decoder, decoder_dropout, cond_dim, 8, 4),
610
+ CNNBlock(d_decoder, d_decoder, decoder_dropout, cond_dim, 4, 2),
611
+ CNNBlock(d_decoder, d_decoder, decoder_dropout, cond_dim, 4, 2),
612
+ ) # receptive field is 180, frame shift is 64
613
+ self.cond_layer = nn.Sequential(
614
+ nn.Linear(cond_dim, in_dim),
615
+ nn.LeakyReLU(),
616
+ nn.Linear(in_dim, in_dim)
617
+ )
618
+ self.time_emb = SinusoidalPosEmb(time_emb_dim)
619
+ self.time_layer = nn.Sequential(
620
+ nn.Linear(time_emb_dim, in_dim),
621
+ nn.LeakyReLU(),
622
+ nn.Linear(in_dim, in_dim)
623
+ )
624
+
625
+ def forward(self, inp, mask, cond, t):
626
+ time_emb = self.time_emb(t) # [B, time_emb_dim]
627
+ time_emb = self.time_layer(time_emb.unsqueeze(1)).transpose(-1, -2)
628
+ inp = inp.transpose(-1, -2)
629
+ cond = cond.transpose(-1, -2)
630
+ inp.masked_fill_(mask.unsqueeze(1), 0.0)
631
+ cond = self.cond_layer(cond.transpose(-1, -2)).transpose(-1, -2)
632
+ cond.masked_fill_(mask.unsqueeze(1), 0.0)
633
+ inp = inp + cond + time_emb
634
+ return self.cnn(inp)
635
+
636
+
637
+ class SpecClassifier(nn.Module):
638
+ def __init__(self, in_dim, d_decoder, h_decoder,
639
+ l_decoder, decoder_dropout,
640
+ k_decoder, n_class, cond_dim, model_type='conformer'):
641
+ super(SpecClassifier, self).__init__()
642
+ self.model_type = model_type
643
+ self.prenet = nn.Sequential(
644
+ nn.Linear(in_features=in_dim, out_features=d_decoder)
645
+ )
646
+ if model_type == 'conformer':
647
+ self.conformer = Conformer(d=d_decoder, d_hidden=d_decoder, n_heads=h_decoder,
648
+ n_layers=l_decoder, dropout=decoder_dropout,
649
+ depthwise_conv_kernel_size=k_decoder, d_cond=cond_dim)
650
+ elif model_type == 'CNN':
651
+ self.conformer = CNNClassifier(in_dim=d_decoder, d_decoder=d_decoder,
652
+ decoder_dropout=decoder_dropout, cond_dim=cond_dim)
653
+ elif model_type == 'CNN-with-time':
654
+ self.conformer = CNNClassifierWithTime(in_dim=d_decoder, d_decoder=d_decoder,
655
+ decoder_dropout=decoder_dropout, cond_dim=cond_dim, time_emb_dim=256)
656
+ self.classifier = nn.Linear(d_decoder, n_class)
657
+
658
+ def forward(self, noisy_mel, condition, mask, **kwargs):
659
+ """
660
+ Args:
661
+ noisy_mel: [B, T, D]
662
+ condition: [B, T, D]
663
+ mask: [B, T], True for un-masked (real) frames
664
+
665
+ Returns:
666
+ classification logits (un-softmaxed)
667
+ """
668
+ # print(noisy_mel.shape)
669
+ noisy_mel = noisy_mel.masked_fill(~mask.unsqueeze(-1), 0.0)
670
+
671
+ # print(self.prenet, noisy_mel.shape)
672
+ hiddens = self.prenet(noisy_mel)
673
+
674
+ if self.model_type == 'CNN-with-time':
675
+ hiddens = self.conformer.forward(hiddens, ~mask, condition, kwargs['t'])
676
+ else:
677
+ hiddens = self.conformer.forward(hiddens, ~mask, condition) # [B, T, D]
678
+
679
+ if self.model_type == 'conformer':
680
+ averaged_hiddens = torch.mean(hiddens, dim=1) # [B, D]
681
+ logits = self.classifier(averaged_hiddens)
682
+ return logits
683
+ elif self.model_type == 'CNN' or self.model_type == 'CNN-with-time':
684
+ hiddens = hiddens.transpose(-1, -2)
685
+ return self.classifier(hiddens) # [B, T', C]
686
+
687
+ @property
688
+ def nparams(self):
689
+ return sum([p.numel() for p in self.parameters()])
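+ # Minimal usage sketch (hypothetical dimensions, not taken from the training config):
+ # clf = SpecClassifier(in_dim=80, d_decoder=256, h_decoder=4, l_decoder=4,
+ #                      decoder_dropout=0.1, k_decoder=7, n_class=5, cond_dim=80,
+ #                      model_type='CNN')
+ # logits = clf(noisy_mel, condition, mask)  # noisy_mel/condition: [B, T, 80]; mask: [B, T], True = real frame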
690
+
model/diffusion.py ADDED
@@ -0,0 +1,513 @@
1
+ import math
2
+ import torch
3
+ from einops import rearrange
4
+
5
+ from model.base import BaseModule
6
+
7
+
8
+ class Mish(BaseModule):
9
+ def forward(self, x):
10
+ return x * torch.tanh(torch.nn.functional.softplus(x))
11
+
12
+
13
+ class Upsample(BaseModule):
14
+ def __init__(self, dim):
15
+ super(Upsample, self).__init__()
16
+ self.conv = torch.nn.ConvTranspose2d(dim, dim, 4, 2, 1)
17
+
18
+ def forward(self, x):
19
+ return self.conv(x)
20
+
21
+
22
+ class Downsample(BaseModule):
23
+ def __init__(self, dim):
24
+ super(Downsample, self).__init__()
25
+ self.conv = torch.nn.Conv2d(dim, dim, 3, 2, 1) # kernel=3, stride=2, padding=1.
26
+
27
+ def forward(self, x):
28
+ return self.conv(x)
29
+
30
+
31
+ class Rezero(BaseModule):
32
+ def __init__(self, fn):
33
+ super(Rezero, self).__init__()
34
+ self.fn = fn
35
+ self.g = torch.nn.Parameter(torch.zeros(1))
36
+
37
+ def forward(self, x):
38
+ return self.fn(x) * self.g
39
+
40
+
41
+ class Block(BaseModule):
42
+ def __init__(self, dim, dim_out, groups=8):
43
+ super(Block, self).__init__()
44
+ self.block = torch.nn.Sequential(torch.nn.Conv2d(dim, dim_out, 3,
45
+ padding=1), torch.nn.GroupNorm(
46
+ groups, dim_out), Mish())
47
+
48
+ def forward(self, x, mask):
49
+ output = self.block(x * mask)
50
+ return output * mask
51
+
52
+
53
+ class ResnetBlock(BaseModule):
54
+ def __init__(self, dim, dim_out, time_emb_dim, groups=8):
55
+ super(ResnetBlock, self).__init__()
56
+ self.mlp = torch.nn.Sequential(Mish(), torch.nn.Linear(time_emb_dim,
57
+ dim_out))
58
+
59
+ self.block1 = Block(dim, dim_out, groups=groups)
60
+ self.block2 = Block(dim_out, dim_out, groups=groups)
61
+ if dim != dim_out:
62
+ self.res_conv = torch.nn.Conv2d(dim, dim_out, 1)
63
+ else:
64
+ self.res_conv = torch.nn.Identity()
65
+
66
+ def forward(self, x, mask, time_emb):
67
+ h = self.block1(x, mask)
68
+ h += self.mlp(time_emb).unsqueeze(-1).unsqueeze(-1)
69
+ h = self.block2(h, mask)
70
+ output = h + self.res_conv(x * mask)
71
+ return output
72
+
73
+
74
+ class LinearAttention(BaseModule):
75
+ def __init__(self, dim, heads=4, dim_head=32):
76
+ super(LinearAttention, self).__init__()
77
+ self.heads = heads
78
+ hidden_dim = dim_head * heads
79
+ self.to_qkv = torch.nn.Conv2d(dim, hidden_dim * 3, 1, bias=False) # NOTE: 1x1 conv
80
+ self.to_out = torch.nn.Conv2d(hidden_dim, dim, 1)
81
+
82
+ def forward(self, x):
83
+ b, c, h, w = x.shape
84
+ qkv = self.to_qkv(x)
85
+ q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', heads=self.heads, qkv=3)
86
+ k = k.softmax(dim=-1)
87
+ context = torch.einsum('bhdn,bhen->bhde', k, v)
88
+ out = torch.einsum('bhde,bhdn->bhen', context, q)
89
+ out = rearrange(out, 'b heads c (h w) -> b (heads c) h w',
90
+ heads=self.heads, h=h, w=w)
91
+ return self.to_out(out)
92
+
93
+
94
+ class Residual(BaseModule):
95
+ def __init__(self, fn):
96
+ super(Residual, self).__init__()
97
+ self.fn = fn
98
+
99
+ def forward(self, x, *args, **kwargs):
100
+ output = self.fn(x, *args, **kwargs) + x
101
+ return output
102
+
103
+
104
+ class SinusoidalPosEmb(BaseModule):
105
+ def __init__(self, dim):
106
+ super(SinusoidalPosEmb, self).__init__()
107
+ self.dim = dim
108
+
109
+ def forward(self, x, scale=1000):
110
+ device = x.device
111
+ half_dim = self.dim // 2
112
+ emb = math.log(10000) / (half_dim - 1)
113
+ emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb)
114
+ emb = scale * x.unsqueeze(1) * emb.unsqueeze(0)
115
+ emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
116
+ return emb
117
+
118
+
119
+ class GradLogPEstimator2d(BaseModule):
120
+ def __init__(self, dim, dim_mults=(1, 2, 4), groups=8, spk_emb_dim=64, n_feats=80, pe_scale=1000):
121
+ super(GradLogPEstimator2d, self).__init__()
122
+ self.dim = dim
123
+ self.dim_mults = dim_mults
124
+ self.groups = groups
125
+ self.spk_emb_dim = spk_emb_dim
126
+ self.pe_scale = pe_scale
127
+
128
+ self.spk_mlp = torch.nn.Sequential(torch.nn.Linear(spk_emb_dim, spk_emb_dim * 4), Mish(),
129
+ torch.nn.Linear(spk_emb_dim * 4, n_feats))
130
+ self.time_pos_emb = SinusoidalPosEmb(dim)
131
+ self.mlp = torch.nn.Sequential(torch.nn.Linear(dim, dim * 4), Mish(),
132
+ torch.nn.Linear(dim * 4, dim))
133
+
134
+ dims = [3, *map(lambda m: dim * m, dim_mults)]
135
+ in_out = list(zip(dims[:-1], dims[1:]))
136
+ self.downs = torch.nn.ModuleList([])
137
+ self.ups = torch.nn.ModuleList([])
138
+ num_resolutions = len(in_out)
139
+
140
+ for ind, (dim_in, dim_out) in enumerate(in_out):
141
+ is_last = ind >= (num_resolutions - 1)
142
+ self.downs.append(torch.nn.ModuleList([
143
+ ResnetBlock(dim_in, dim_out, time_emb_dim=dim),
144
+ ResnetBlock(dim_out, dim_out, time_emb_dim=dim),
145
+ Residual(Rezero(LinearAttention(dim_out))),
146
+ Downsample(dim_out) if not is_last else torch.nn.Identity()]))
147
+
148
+ mid_dim = dims[-1]
149
+ self.mid_block1 = ResnetBlock(mid_dim, mid_dim, time_emb_dim=dim)
150
+ self.mid_attn = Residual(Rezero(LinearAttention(mid_dim)))
151
+ self.mid_block2 = ResnetBlock(mid_dim, mid_dim, time_emb_dim=dim)
152
+
153
+ for ind, (dim_in, dim_out) in enumerate(reversed(in_out[1:])):
154
+ self.ups.append(torch.nn.ModuleList([
155
+ ResnetBlock(dim_out * 2, dim_in, time_emb_dim=dim),
156
+ ResnetBlock(dim_in, dim_in, time_emb_dim=dim),
157
+ Residual(Rezero(LinearAttention(dim_in))),
158
+ Upsample(dim_in)]))
159
+ self.final_block = Block(dim, dim)
160
+ self.final_conv = torch.nn.Conv2d(dim, 1, 1)
161
+
162
+ def forward(self, x, mask, mu, t, spk=None):
163
+ # x, mu: [B, 80, L], t: [B, ], mask: [B, 1, L]
164
+ if spk is not None:
165
+ s = self.spk_mlp(spk)
166
+
167
+ t = self.time_pos_emb(t, scale=self.pe_scale)
168
+ t = self.mlp(t) # [B, 64]
169
+
170
+ s = s.unsqueeze(-1).repeat(1, 1, x.shape[-1])
171
+ x = torch.stack([mu, x, s], 1) # [B, 3, 80, L]
172
+ mask = mask.unsqueeze(1) # [B, 1, 1, L]
173
+
174
+ hiddens = []
175
+ masks = [mask]
176
+ for resnet1, resnet2, attn, downsample in self.downs:
177
+ mask_down = masks[-1]
178
+ x = resnet1(x, mask_down, t) # [B, 64, 80, L]
179
+ x = resnet2(x, mask_down, t)
180
+ x = attn(x)
181
+ hiddens.append(x)
182
+ x = downsample(x * mask_down)
183
+ masks.append(mask_down[:, :, :, ::2])
184
+
185
+ masks = masks[:-1]
186
+ mask_mid = masks[-1]
187
+ x = self.mid_block1(x, mask_mid, t)
188
+ x = self.mid_attn(x)
189
+ x = self.mid_block2(x, mask_mid, t)
190
+
191
+ for resnet1, resnet2, attn, upsample in self.ups:
192
+ mask_up = masks.pop()
193
+ x = torch.cat((x, hiddens.pop()), dim=1)
194
+ x = resnet1(x, mask_up, t)
195
+ x = resnet2(x, mask_up, t)
196
+ x = attn(x)
197
+ x = upsample(x * mask_up)
198
+
199
+ x = self.final_block(x, mask)
200
+ output = self.final_conv(x * mask)
201
+
202
+ return (output * mask).squeeze(1)
203
+
204
+
205
+ def get_noise(t, beta_init, beta_term, cumulative=False):
206
+ if cumulative:
207
+ noise = beta_init*t + 0.5*(beta_term - beta_init)*(t**2)
208
+ else:
209
+ noise = beta_init + (beta_term - beta_init)*t
210
+ return noise
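+ # For the linear schedule beta_s = beta_init + (beta_term - beta_init) * s, the
+ # cumulative=True branch returns its integral over [0, t]:
+ # beta_init * t + 0.5 * (beta_term - beta_init) * t**2.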
211
+
212
+
213
+ class Diffusion(BaseModule):
214
+ def __init__(self, n_feats, dim, spk_emb_dim=64,
215
+ beta_min=0.05, beta_max=20, pe_scale=1000):
216
+ super(Diffusion, self).__init__()
217
+ self.n_feats = n_feats
218
+ self.dim = dim
219
+ # self.n_spks = n_spks
220
+ self.spk_emb_dim = spk_emb_dim
221
+ self.beta_min = beta_min
222
+ self.beta_max = beta_max
223
+ self.pe_scale = pe_scale
224
+
225
+ self.estimator = GradLogPEstimator2d(dim,
226
+ spk_emb_dim=spk_emb_dim,
227
+ pe_scale=pe_scale,
228
+ n_feats=n_feats)
229
+
230
+ def forward_diffusion(self, x0, mask, mu, t):
231
+ time = t.unsqueeze(-1).unsqueeze(-1)
232
+ cum_noise = get_noise(time, self.beta_min, self.beta_max, cumulative=True) # it is actually the integral of beta
233
+ mean = x0*torch.exp(-0.5*cum_noise) + mu*(1.0 - torch.exp(-0.5*cum_noise))
234
+ variance = 1.0 - torch.exp(-cum_noise)
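+ # Closed-form moments of the forward SDE with drift 0.5 * (mu - x) * beta_t:
+ # x_t | x_0 ~ N(x_0 * exp(-0.5*I(t)) + mu * (1 - exp(-0.5*I(t))), (1 - exp(-I(t))) * I),
+ # where I(t) is the cumulative noise computed above.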
235
+ z = torch.randn(x0.shape, dtype=x0.dtype, device=x0.device,
236
+ requires_grad=False)
237
+ xt = mean + z * torch.sqrt(variance)
238
+ return xt * mask, z * mask
239
+
240
+ @torch.no_grad()
241
+ def reverse_diffusion(self, z, mask, mu, n_timesteps, stoc=False, spk=None,
242
+ use_classifier_free=False,
243
+ classifier_free_guidance=3.0,
244
+ dummy_spk=None): # emo need to be merged by spk
245
+
246
+ # looks like a plain Euler-Maruyama method
247
+ h = 1.0 / n_timesteps
248
+ xt = z * mask
249
+ for i in range(n_timesteps):
250
+ t = (1.0 - (i + 0.5)*h) * torch.ones(z.shape[0], dtype=z.dtype,
251
+ device=z.device)
252
+ time = t.unsqueeze(-1).unsqueeze(-1)
253
+ noise_t = get_noise(time, self.beta_min, self.beta_max,
254
+ cumulative=False)
255
+
256
+ if not use_classifier_free:
257
+ if stoc: # adds stochastic term
258
+ dxt_det = 0.5 * (mu - xt) - self.estimator(xt, mask, mu, t, spk)
259
+ dxt_det = dxt_det * noise_t * h
260
+ dxt_stoc = torch.randn(z.shape, dtype=z.dtype, device=z.device,
261
+ requires_grad=False)
262
+ dxt_stoc = dxt_stoc * torch.sqrt(noise_t * h)
263
+ dxt = dxt_det + dxt_stoc
264
+ else:
265
+ dxt = 0.5 * (mu - xt - self.estimator(xt, mask, mu, t, spk))
266
+ dxt = dxt * noise_t * h
267
+ xt = (xt - dxt) * mask
268
+ else:
269
+ if stoc: # adds stochastic term
270
+ score_estimate = (1 + classifier_free_guidance) * self.estimator(xt, mask, mu, t, spk) \
271
+ - classifier_free_guidance * self.estimator(xt, mask, mu, t, dummy_spk)
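+ # Classifier-free guidance: extrapolate away from the "dummy" (unconditional)
+ # score towards the conditional one with weight classifier_free_guidance.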
272
+ dxt_det = 0.5 * (mu - xt) - score_estimate
273
+ dxt_det = dxt_det * noise_t * h
274
+ dxt_stoc = torch.randn(z.shape, dtype=z.dtype, device=z.device,
275
+ requires_grad=False)
276
+ dxt_stoc = dxt_stoc * torch.sqrt(noise_t * h)
277
+ dxt = dxt_det + dxt_stoc
278
+ else:
279
+ score_estimate = (1 + classifier_free_guidance) * self.estimator(xt, mask, mu, t, spk) \
280
+ - classifier_free_guidance * self.estimator(xt, mask, mu, t, dummy_spk)
281
+ dxt = 0.5 * (mu - xt - score_estimate)
282
+ dxt = dxt * noise_t * h
283
+ xt = (xt - dxt) * mask
284
+ return xt
285
+
286
+ @torch.no_grad()
287
+ def forward(self, z, mask, mu, n_timesteps, stoc=False, spk=None,
288
+ use_classifier_free=False,
289
+ classifier_free_guidance=3.0,
290
+ dummy_spk=None
291
+ ):
292
+ return self.reverse_diffusion(z, mask, mu, n_timesteps, stoc, spk, use_classifier_free, classifier_free_guidance, dummy_spk)
293
+
294
+ def loss_t(self, x0, mask, mu, t, spk=None):
295
+ xt, z = self.forward_diffusion(x0, mask, mu, t) # z is sampled from N(0, I)
296
+ time = t.unsqueeze(-1).unsqueeze(-1)
297
+ cum_noise = get_noise(time, self.beta_min, self.beta_max, cumulative=True)
298
+ noise_estimation = self.estimator(xt, mask, mu, t, spk)
299
+ noise_estimation *= torch.sqrt(1.0 - torch.exp(-cum_noise)) # multiply by lambda which is set to be variance
300
+ # (strictly, this multiplies by sqrt(lambda), not lambda itself)
301
+ # NOTE: folding sqrt(lambda) into the squared error avoids dividing z by the noise std.
302
+ loss = torch.sum((noise_estimation + z)**2) / (torch.sum(mask)*self.n_feats)
303
+ return loss, xt
304
+
305
+ def compute_loss(self, x0, mask, mu, spk=None, offset=1e-5):
306
+ t = torch.rand(x0.shape[0], dtype=x0.dtype, device=x0.device,
307
+ requires_grad=False)
308
+ t = torch.clamp(t, offset, 1.0 - offset)
309
+ return self.loss_t(x0, mask, mu, t, spk)
310
+
311
+ def classifier_decode(self, z, mask, mu, n_timesteps, stoc=False, spk=None, classifier_func=None, guidance=1.0, control_emo=None, classifier_type="conformer"):
312
+ # control_emo should be [B, ] tensor
313
+ h = 1.0 / n_timesteps
314
+ xt = z * mask
315
+ for i in range(n_timesteps):
316
+ t = (1.0 - (i + 0.5) * h) * torch.ones(z.shape[0], dtype=z.dtype,
317
+ device=z.device)
318
+ time = t.unsqueeze(-1).unsqueeze(-1)
319
+ noise_t = get_noise(time, self.beta_min, self.beta_max,
320
+ cumulative=False)
321
+ # =========== classifier part ==============
322
+ xt = xt.detach()
323
+ xt.requires_grad_(True)
324
+ if classifier_type == 'CNN-with-time':
325
+ logits = classifier_func(xt.transpose(1, 2), mu.transpose(1, 2), (mask == 1.0).squeeze(1), t=t)
326
+ else:
327
+ logits = classifier_func(xt.transpose(1, 2), mu.transpose(1, 2), (mask == 1.0).squeeze(1))
328
+
329
+ if classifier_type == 'conformer': # [B, C]
330
+ probs = torch.log_softmax(logits, dim=-1) # [B, C]
331
+ elif classifier_type == 'CNN' or classifier_type == 'CNN-with-time' :
332
+ probs_every_place = torch.softmax(logits, dim=-1) # [B, T', C]
333
+ probs_mean = torch.mean(probs_every_place, dim=1) # [B, C]
334
+ probs = torch.log(probs_mean)
335
+ else:
336
+ raise NotImplementedError
337
+
338
+ control_emo_probs = probs[torch.arange(len(control_emo)).to(control_emo.device), control_emo]
339
+ control_emo_probs.sum().backward(retain_graph=True)
340
+ # NOTE: summing gives every batch element equal weight in the gradient.
341
+ xt_grad = xt.grad
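+ # Classifier guidance: xt_grad is the gradient of log p(target emotion | x_t) w.r.t.
+ # x_t; it is added to the score term in the update below, scaled by `guidance`.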
342
+ # ==========================================
343
+
344
+ if stoc: # adds stochastic term
345
+ dxt_det = 0.5 * (mu - xt) - self.estimator(xt, mask, mu, t, spk) - guidance * xt_grad
346
+ dxt_det = dxt_det * noise_t * h
347
+ dxt_stoc = torch.randn(z.shape, dtype=z.dtype, device=z.device,
348
+ requires_grad=False)
349
+ dxt_stoc = dxt_stoc * torch.sqrt(noise_t * h)
350
+ dxt = dxt_det + dxt_stoc
351
+ else:
352
+ dxt = 0.5 * (mu - xt - self.estimator(xt, mask, mu, t, spk) - guidance * xt_grad)
353
+ dxt = dxt * noise_t * h
354
+ xt = (xt - dxt) * mask
355
+ return xt
356
+
357
+ def classifier_decode_DPS(self, z, mask, mu, n_timesteps, stoc=False, spk=None, classifier_func=None, guidance=1.0, control_emo=None, classifier_type="conformer"):
358
+ # control_emo should be [B, ] tensor
359
+ h = 1.0 / n_timesteps
360
+ xt = z * mask
361
+ for i in range(n_timesteps):
362
+ t = (1.0 - (i + 0.5) * h) * torch.ones(z.shape[0], dtype=z.dtype, device=z.device)
363
+ time = t.unsqueeze(-1).unsqueeze(-1)
364
+ noise_t = get_noise(time, self.beta_min, self.beta_max, cumulative=False)
365
+ beta_integral_t = get_noise(time, self.beta_min, self.beta_max, cumulative=True)
366
+ bar_alpha_t = torch.exp(-beta_integral_t) # torch.exp: beta_integral_t is a [B, 1, 1] tensor
367
+
368
+ # =========== classifier part ==============
369
+ xt = xt.detach()
370
+ xt.requires_grad_(True)
371
+ score_estimate = self.estimator(xt, mask, mu, t, spk)
372
+ x0_hat = (xt + (1 - bar_alpha_t) * score_estimate) / torch.sqrt(bar_alpha_t)
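+ # DPS-style guidance: the classifier is evaluated on a Tweedie-style estimate of
+ # x_0 reconstructed from x_t and the score, rather than on the noisy x_t itself.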
373
+
374
+ if classifier_type == 'CNN-with-time':
375
+ raise NotImplementedError
376
+ else:
377
+ logits = classifier_func(x0_hat.transpose(1, 2), mu.transpose(1, 2), (mask == 1.0).squeeze(1))
378
+ if classifier_type == 'conformer': # [B, C]
379
+ probs = torch.log_softmax(logits, dim=-1) # [B, C]
380
+ elif classifier_type == 'CNN':
381
+ probs_every_place = torch.softmax(logits, dim=-1) # [B, T', C]
382
+ probs_mean = torch.mean(probs_every_place, dim=1) # [B, C]
383
+
384
+ probs_mean = probs_mean + 10E-10
385
+ # NOTE: at the first few steps, x0 may be very large. Then the classifier output logits will also have extreme value range.
386
+ #
387
+
388
+ probs = torch.log(probs_mean)
389
+ else:
390
+ raise NotImplementedError
391
+
392
+ control_emo_probs = probs[torch.arange(len(control_emo)).to(control_emo.device), control_emo]
393
+ control_emo_probs.sum().backward(retain_graph=True)
394
+ # NOTE: summing gives every batch element equal weight in the gradient.
395
+ xt_grad = xt.grad
396
+ # ==========================================
397
+
398
+ if stoc: # adds stochastic term
399
+ dxt_det = 0.5 * (mu - xt) - score_estimate - guidance * xt_grad
400
+ dxt_det = dxt_det * noise_t * h
401
+ dxt_stoc = torch.randn(z.shape, dtype=z.dtype, device=z.device, requires_grad=False)
402
+ dxt_stoc = dxt_stoc * torch.sqrt(noise_t * h)
403
+ dxt = dxt_det + dxt_stoc
404
+ else:
405
+ dxt = 0.5 * (mu - xt - score_estimate - guidance * xt_grad)
406
+ dxt = dxt * noise_t * h
407
+ xt = (xt - dxt) * mask
408
+ return xt
409
+
410
+ def classifier_decode_mixture(self, z, mask, mu, n_timesteps, stoc=False, spk=None, classifier_func=None, guidance=1.0, control_emo1=None,control_emo2=None, emo1_weight=None, classifier_type="conformer"):
411
+ # control_emo should be [B, ] tensor
412
+ h = 1.0 / n_timesteps
413
+ xt = z * mask
414
+ for i in range(n_timesteps):
415
+ t = (1.0 - (i + 0.5) * h) * torch.ones(z.shape[0], dtype=z.dtype,
416
+ device=z.device)
417
+ time = t.unsqueeze(-1).unsqueeze(-1)
418
+ noise_t = get_noise(time, self.beta_min, self.beta_max,
419
+ cumulative=False)
420
+ # =========== classifier part ==============
421
+ xt = xt.detach()
422
+ xt.requires_grad_(True)
423
+ if classifier_type == 'CNN-with-time':
424
+ logits = classifier_func(xt.transpose(1, 2), mu.transpose(1, 2), (mask == 1.0).squeeze(1), t=t)
425
+ else:
426
+ logits = classifier_func(xt.transpose(1, 2), mu.transpose(1, 2), (mask == 1.0).squeeze(1))
427
+
428
+ if classifier_type == 'conformer': # [B, C]
429
+ probs = torch.log_softmax(logits, dim=-1) # [B, C]
430
+ elif classifier_type == 'CNN' or classifier_type == 'CNN-with-time' :
431
+ probs_every_place = torch.softmax(logits, dim=-1) # [B, T', C]
432
+ probs_mean = torch.mean(probs_every_place, dim=1) # [B, C]
433
+ probs = torch.log(probs_mean)
434
+ else:
435
+ raise NotImplementedError
436
+
437
+ control_emo_probs1 = probs[torch.arange(len(control_emo1)).to(control_emo1.device), control_emo1]
438
+ control_emo_probs2 = probs[torch.arange(len(control_emo2)).to(control_emo2.device), control_emo2]
439
+ control_emo_probs = control_emo_probs1 * emo1_weight + control_emo_probs2 * (1-emo1_weight) # interpolate
440
+
441
+ control_emo_probs.sum().backward(retain_graph=True)
442
+ # NOTE: summing gives every batch element equal weight in the gradient.
443
+ xt_grad = xt.grad
444
+ # ==========================================
445
+
446
+ if stoc: # adds stochastic term
447
+ dxt_det = 0.5 * (mu - xt) - self.estimator(xt, mask, mu, t, spk) - guidance * xt_grad
448
+ dxt_det = dxt_det * noise_t * h
449
+ dxt_stoc = torch.randn(z.shape, dtype=z.dtype, device=z.device,
450
+ requires_grad=False)
451
+ dxt_stoc = dxt_stoc * torch.sqrt(noise_t * h)
452
+ dxt = dxt_det + dxt_stoc
453
+ else:
454
+ dxt = 0.5 * (mu - xt - self.estimator(xt, mask, mu, t, spk) - guidance * xt_grad)
455
+ dxt = dxt * noise_t * h
456
+ xt = (xt - dxt) * mask
457
+ return xt
458
+
459
+ def classifier_decode_mixture_DPS(self, z, mask, mu, n_timesteps, stoc=False, spk=None, classifier_func=None, guidance=1.0, control_emo1=None,control_emo2=None, emo1_weight=None, classifier_type="conformer"):
460
+ # control_emo should be [B, ] tensor
461
+ h = 1.0 / n_timesteps
462
+ xt = z * mask
463
+ for i in range(n_timesteps):
464
+ t = (1.0 - (i + 0.5) * h) * torch.ones(z.shape[0], dtype=z.dtype,
465
+ device=z.device)
466
+ time = t.unsqueeze(-1).unsqueeze(-1)
467
+ noise_t = get_noise(time, self.beta_min, self.beta_max,
468
+ cumulative=False)
469
+ beta_integral_t = get_noise(time, self.beta_min, self.beta_max, cumulative=True)
470
+ bar_alpha_t = torch.exp(-beta_integral_t) # torch.exp: beta_integral_t is a [B, 1, 1] tensor
471
+ # =========== classifier part ==============
472
+ xt = xt.detach()
473
+ xt.requires_grad_(True)
474
+ score_estimate = self.estimator(xt, mask, mu, t, spk)
475
+ x0_hat = (xt + (1 - bar_alpha_t) * score_estimate) / torch.sqrt(bar_alpha_t)
476
+
477
+ if classifier_type == 'CNN-with-time':
478
+ raise NotImplementedError
479
+ else:
480
+ logits = classifier_func(x0_hat.transpose(1, 2), mu.transpose(1, 2), (mask == 1.0).squeeze(1))
481
+
482
+ if classifier_type == 'conformer': # [B, C]
483
+ probs = torch.log_softmax(logits, dim=-1) # [B, C]
484
+ elif classifier_type == 'CNN' or classifier_type == 'CNN-with-time' :
485
+ probs_every_place = torch.softmax(logits, dim=-1) # [B, T', C]
486
+ probs_mean = torch.mean(probs_every_place, dim=1) # [B, C]
487
+ probs_mean = probs_mean + 10E-10
488
+
489
+ probs = torch.log(probs_mean)
490
+ else:
491
+ raise NotImplementedError
492
+
493
+ control_emo_probs1 = probs[torch.arange(len(control_emo1)).to(control_emo1.device), control_emo1]
494
+ control_emo_probs2 = probs[torch.arange(len(control_emo2)).to(control_emo2.device), control_emo2]
495
+ control_emo_probs = control_emo_probs1 * emo1_weight + control_emo_probs2 * (1-emo1_weight) # interpolate
496
+
497
+ control_emo_probs.sum().backward(retain_graph=True)
498
+ # NOTE: summing gives every batch element equal weight in the gradient.
499
+ xt_grad = xt.grad
500
+ # ==========================================
501
+
502
+ if stoc: # adds stochastic term
503
+ dxt_det = 0.5 * (mu - xt) - score_estimate - guidance * xt_grad
504
+ dxt_det = dxt_det * noise_t * h
505
+ dxt_stoc = torch.randn(z.shape, dtype=z.dtype, device=z.device,
506
+ requires_grad=False)
507
+ dxt_stoc = dxt_stoc * torch.sqrt(noise_t * h)
508
+ dxt = dxt_det + dxt_stoc
509
+ else:
510
+ dxt = 0.5 * (mu - xt - score_estimate - guidance * xt_grad)
511
+ dxt = dxt * noise_t * h
512
+ xt = (xt - dxt) * mask
513
+ return xt
model/monotonic_align/LICENCE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2020 Jaehyeon Kim
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
model/monotonic_align/__init__.py ADDED
@@ -0,0 +1,23 @@
1
+ """ from https://github.com/jaywalnut310/glow-tts """
2
+
3
+ import numpy as np
4
+ import torch
5
+ from .model.monotonic_align.core import maximum_path_c
6
+
7
+
8
+ def maximum_path(value, mask):
9
+ """ Cython optimised version.
10
+ value: [b, t_x, t_y]
11
+ mask: [b, t_x, t_y]
12
+ """
13
+ value = value * mask
14
+ device = value.device
15
+ dtype = value.dtype
16
+ value = value.data.cpu().numpy().astype(np.float32)
17
+ path = np.zeros_like(value).astype(np.int32)
18
+ mask = mask.data.cpu().numpy()
19
+
20
+ t_x_max = mask.sum(1)[:, 0].astype(np.int32)
21
+ t_y_max = mask.sum(2)[:, 0].astype(np.int32)
22
+ maximum_path_c(path, value, t_x_max, t_y_max)
23
+ return torch.from_numpy(path).to(device=device, dtype=dtype)
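+ # Example (shapes only): value and mask are [B, t_x, t_y]; the returned path has the
+ # same shape and marks a hard monotonic alignment with one active text index per frame.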
model/monotonic_align/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (903 Bytes). View file
 
model/monotonic_align/build/lib.macosx-11.1-arm64-cpython-310/model/monotonic_align/core.cpython-310-darwin.so ADDED
Binary file (162 kB). View file
 
model/monotonic_align/build/temp.linux-x86_64-3.6/core.o ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b523df88cfc72d08a590c92052df8d4e252bfec3edc67981099a163f5a112ed
3
+ size 2200712
model/monotonic_align/build/temp.macosx-10.9-x86_64-3.6/core.o ADDED
Binary file (616 kB). View file
 
model/monotonic_align/build/temp.macosx-11.1-arm64-cpython-310/core.o ADDED
Binary file (173 kB). View file
 
model/monotonic_align/core.c ADDED
The diff for this file is too large to render. See raw diff
 
model/monotonic_align/core.pyx ADDED
@@ -0,0 +1,45 @@
1
+ import numpy as np
2
+ cimport numpy as np
3
+ cimport cython
4
+ from cython.parallel import prange
5
+
6
+
7
+ @cython.boundscheck(False)
8
+ @cython.wraparound(False)
9
+ cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_x, int t_y, float max_neg_val) nogil:
10
+ cdef int x
11
+ cdef int y
12
+ cdef float v_prev
13
+ cdef float v_cur
14
+ cdef float tmp
15
+ cdef int index = t_x - 1
16
+
17
+ for y in range(t_y):
18
+ for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
19
+ if x == y:
20
+ v_cur = max_neg_val
21
+ else:
22
+ v_cur = value[x, y-1]
23
+ if x == 0:
24
+ if y == 0:
25
+ v_prev = 0.
26
+ else:
27
+ v_prev = max_neg_val
28
+ else:
29
+ v_prev = value[x-1, y-1]
30
+ value[x, y] = max(v_cur, v_prev) + value[x, y]
31
+
32
+ for y in range(t_y - 1, -1, -1):
33
+ path[index, y] = 1
34
+ if index != 0 and (index == y or value[index, y-1] < value[index-1, y-1]):
35
+ index = index - 1
36
+
37
+
38
+ @cython.boundscheck(False)
39
+ @cython.wraparound(False)
40
+ cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_xs, int[::1] t_ys, float max_neg_val=-1e9) nogil:
41
+ cdef int b = values.shape[0]
42
+
43
+ cdef int i
44
+ for i in prange(b, nogil=True):
45
+ maximum_path_each(paths[i], values[i], t_xs[i], t_ys[i], max_neg_val)
model/monotonic_align/model/monotonic_align/core.cpython-310-darwin.so ADDED
Binary file (162 kB). View file
 
model/monotonic_align/setup.py ADDED
@@ -0,0 +1,11 @@
1
+ """ from https://github.com/jaywalnut310/glow-tts """
2
+
3
+ from distutils.core import setup
4
+ from Cython.Build import cythonize
5
+ import numpy
6
+
7
+ setup(
8
+ name = 'monotonic_align',
9
+ ext_modules = cythonize("core.pyx"),
10
+ include_dirs=[numpy.get_include()]
11
+ )
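+ # Typical build invocation (run from model/monotonic_align/), assuming Cython and
+ # numpy are installed:
+ # python setup.py build_ext --inplace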
model/text_encoder.py ADDED
@@ -0,0 +1,326 @@
1
+ """ from https://github.com/jaywalnut310/glow-tts """
2
+
3
+ import math
4
+
5
+ import torch
6
+
7
+ from model.base import BaseModule
8
+ from model.utils import sequence_mask, convert_pad_shape
9
+
10
+
11
+ class LayerNorm(BaseModule):
12
+ def __init__(self, channels, eps=1e-4):
13
+ super(LayerNorm, self).__init__()
14
+ self.channels = channels
15
+ self.eps = eps
16
+
17
+ self.gamma = torch.nn.Parameter(torch.ones(channels))
18
+ self.beta = torch.nn.Parameter(torch.zeros(channels))
19
+
20
+ def forward(self, x):
21
+ n_dims = len(x.shape)
22
+ mean = torch.mean(x, 1, keepdim=True)
23
+ variance = torch.mean((x - mean)**2, 1, keepdim=True)
24
+
25
+ x = (x - mean) * torch.rsqrt(variance + self.eps)
26
+
27
+ shape = [1, -1] + [1] * (n_dims - 2)
28
+ x = x * self.gamma.view(*shape) + self.beta.view(*shape)
29
+ return x
30
+
31
+
32
+ class ConvReluNorm(BaseModule):
33
+ def __init__(self, in_channels, hidden_channels, out_channels, kernel_size,
34
+ n_layers, p_dropout):
35
+ super(ConvReluNorm, self).__init__()
36
+ self.in_channels = in_channels
37
+ self.hidden_channels = hidden_channels
38
+ self.out_channels = out_channels
39
+ self.kernel_size = kernel_size
40
+ self.n_layers = n_layers
41
+ self.p_dropout = p_dropout
42
+
43
+ self.conv_layers = torch.nn.ModuleList()
44
+ self.norm_layers = torch.nn.ModuleList()
45
+ self.conv_layers.append(torch.nn.Conv1d(in_channels, hidden_channels,
46
+ kernel_size, padding=kernel_size//2))
47
+ self.norm_layers.append(LayerNorm(hidden_channels))
48
+ self.relu_drop = torch.nn.Sequential(torch.nn.ReLU(), torch.nn.Dropout(p_dropout))
49
+ for _ in range(n_layers - 1):
50
+ self.conv_layers.append(torch.nn.Conv1d(hidden_channels, hidden_channels,
51
+ kernel_size, padding=kernel_size//2))
52
+ self.norm_layers.append(LayerNorm(hidden_channels))
53
+ self.proj = torch.nn.Conv1d(hidden_channels, out_channels, 1)
54
+ self.proj.weight.data.zero_()
55
+ self.proj.bias.data.zero_()
56
+
57
+ def forward(self, x, x_mask):
58
+ x_org = x
59
+ for i in range(self.n_layers):
60
+ x = self.conv_layers[i](x * x_mask)
61
+ x = self.norm_layers[i](x)
62
+ x = self.relu_drop(x)
63
+ x = x_org + self.proj(x)
64
+ return x * x_mask
65
+
66
+
67
+ class DurationPredictor(BaseModule):
68
+ def __init__(self, in_channels, filter_channels, kernel_size, p_dropout):
69
+ super(DurationPredictor, self).__init__()
70
+ self.in_channels = in_channels
71
+ self.filter_channels = filter_channels
72
+ self.p_dropout = p_dropout
73
+
74
+ self.drop = torch.nn.Dropout(p_dropout)
75
+ self.conv_1 = torch.nn.Conv1d(in_channels, filter_channels,
76
+ kernel_size, padding=kernel_size//2)
77
+ self.norm_1 = LayerNorm(filter_channels)
78
+ self.conv_2 = torch.nn.Conv1d(filter_channels, filter_channels,
79
+ kernel_size, padding=kernel_size//2)
80
+ self.norm_2 = LayerNorm(filter_channels)
81
+ self.proj = torch.nn.Conv1d(filter_channels, 1, 1)
82
+
83
+ def forward(self, x, x_mask):
84
+ x = self.conv_1(x * x_mask)
85
+ x = torch.relu(x)
86
+ x = self.norm_1(x)
87
+ x = self.drop(x)
88
+ x = self.conv_2(x * x_mask)
89
+ x = torch.relu(x)
90
+ x = self.norm_2(x)
91
+ x = self.drop(x)
92
+ x = self.proj(x * x_mask)
93
+ return x * x_mask
94
+
95
+
96
+ class MultiHeadAttention(BaseModule):
97
+ def __init__(self, channels, out_channels, n_heads, window_size=None,
98
+ heads_share=True, p_dropout=0.0, proximal_bias=False,
99
+ proximal_init=False):
100
+ super(MultiHeadAttention, self).__init__()
101
+ assert channels % n_heads == 0
102
+
103
+ self.channels = channels
104
+ self.out_channels = out_channels
105
+ self.n_heads = n_heads
106
+ self.window_size = window_size
107
+ self.heads_share = heads_share
108
+ self.proximal_bias = proximal_bias
109
+ self.p_dropout = p_dropout
110
+ self.attn = None
111
+
112
+ self.k_channels = channels // n_heads
113
+ self.conv_q = torch.nn.Conv1d(channels, channels, 1)
114
+ self.conv_k = torch.nn.Conv1d(channels, channels, 1)
115
+ self.conv_v = torch.nn.Conv1d(channels, channels, 1)
116
+ if window_size is not None:
117
+ n_heads_rel = 1 if heads_share else n_heads
118
+ rel_stddev = self.k_channels**-0.5
119
+ self.emb_rel_k = torch.nn.Parameter(torch.randn(n_heads_rel,
120
+ window_size * 2 + 1, self.k_channels) * rel_stddev)
121
+ self.emb_rel_v = torch.nn.Parameter(torch.randn(n_heads_rel,
122
+ window_size * 2 + 1, self.k_channels) * rel_stddev)
123
+ self.conv_o = torch.nn.Conv1d(channels, out_channels, 1)
124
+ self.drop = torch.nn.Dropout(p_dropout)
125
+
126
+ torch.nn.init.xavier_uniform_(self.conv_q.weight)
127
+ torch.nn.init.xavier_uniform_(self.conv_k.weight)
128
+ if proximal_init:
129
+ self.conv_k.weight.data.copy_(self.conv_q.weight.data)
130
+ self.conv_k.bias.data.copy_(self.conv_q.bias.data)
131
+ torch.nn.init.xavier_uniform_(self.conv_v.weight)
132
+
133
+ def forward(self, x, c, attn_mask=None):
134
+ q = self.conv_q(x)
135
+ k = self.conv_k(c)
136
+ v = self.conv_v(c)
137
+
138
+ x, self.attn = self.attention(q, k, v, mask=attn_mask)
139
+
140
+ x = self.conv_o(x)
141
+ return x
142
+
143
+ def attention(self, query, key, value, mask=None):
144
+ b, d, t_s, t_t = (*key.size(), query.size(2))
145
+ query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
146
+ key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
147
+ value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
148
+
149
+ scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.k_channels)
150
+ if self.window_size is not None:
151
+ assert t_s == t_t, "Relative attention is only available for self-attention."
152
+ key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
153
+ rel_logits = self._matmul_with_relative_keys(query, key_relative_embeddings)
154
+ rel_logits = self._relative_position_to_absolute_position(rel_logits)
155
+ scores_local = rel_logits / math.sqrt(self.k_channels)
156
+ scores = scores + scores_local
157
+ if self.proximal_bias:
158
+ assert t_s == t_t, "Proximal bias is only available for self-attention."
159
+ scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device,
160
+ dtype=scores.dtype)
161
+ if mask is not None:
162
+ scores = scores.masked_fill(mask == 0, -1e4)
163
+ p_attn = torch.nn.functional.softmax(scores, dim=-1)
164
+ p_attn = self.drop(p_attn)
165
+ output = torch.matmul(p_attn, value)
166
+ if self.window_size is not None:
167
+ relative_weights = self._absolute_position_to_relative_position(p_attn)
168
+ value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
169
+ output = output + self._matmul_with_relative_values(relative_weights,
170
+ value_relative_embeddings)
171
+ output = output.transpose(2, 3).contiguous().view(b, d, t_t)
172
+ return output, p_attn
173
+
174
+ def _matmul_with_relative_values(self, x, y):
175
+ ret = torch.matmul(x, y.unsqueeze(0))
176
+ return ret
177
+
178
+ def _matmul_with_relative_keys(self, x, y):
179
+ ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
180
+ return ret
181
+
182
+ def _get_relative_embeddings(self, relative_embeddings, length):
183
+ pad_length = max(length - (self.window_size + 1), 0)
184
+ slice_start_position = max((self.window_size + 1) - length, 0)
185
+ slice_end_position = slice_start_position + 2 * length - 1
186
+ if pad_length > 0:
187
+ padded_relative_embeddings = torch.nn.functional.pad(
188
+ relative_embeddings, convert_pad_shape([[0, 0],
189
+ [pad_length, pad_length], [0, 0]]))
190
+ else:
191
+ padded_relative_embeddings = relative_embeddings
192
+ used_relative_embeddings = padded_relative_embeddings[:,
193
+ slice_start_position:slice_end_position]
194
+ return used_relative_embeddings
195
+
196
+ def _relative_position_to_absolute_position(self, x):
197
+ batch, heads, length, _ = x.size()
198
+ x = torch.nn.functional.pad(x, convert_pad_shape([[0,0],[0,0],[0,0],[0,1]]))
199
+ x_flat = x.view([batch, heads, length * 2 * length])
200
+ x_flat = torch.nn.functional.pad(x_flat, convert_pad_shape([[0,0],[0,0],[0,length-1]]))
201
+ x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:]
202
+ return x_final
203
+
204
+ def _absolute_position_to_relative_position(self, x):
205
+ batch, heads, length, _ = x.size()
206
+ x = torch.nn.functional.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]]))
207
+ x_flat = x.view([batch, heads, length**2 + length*(length - 1)])
208
+ x_flat = torch.nn.functional.pad(x_flat, convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
209
+ x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:]
210
+ return x_final
211
+
212
+ def _attention_bias_proximal(self, length):
213
+ r = torch.arange(length, dtype=torch.float32)
214
+ diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
215
+ return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
216
+
217
+
218
+ class FFN(BaseModule):
219
+ def __init__(self, in_channels, out_channels, filter_channels, kernel_size,
220
+ p_dropout=0.0):
221
+ super(FFN, self).__init__()
222
+ self.in_channels = in_channels
223
+ self.out_channels = out_channels
224
+ self.filter_channels = filter_channels
225
+ self.kernel_size = kernel_size
226
+ self.p_dropout = p_dropout
227
+
228
+ self.conv_1 = torch.nn.Conv1d(in_channels, filter_channels, kernel_size,
229
+ padding=kernel_size//2)
230
+ self.conv_2 = torch.nn.Conv1d(filter_channels, out_channels, kernel_size,
231
+ padding=kernel_size//2)
232
+ self.drop = torch.nn.Dropout(p_dropout)
233
+
234
+ def forward(self, x, x_mask):
235
+ x = self.conv_1(x * x_mask)
236
+ x = torch.relu(x)
237
+ x = self.drop(x)
238
+ x = self.conv_2(x * x_mask)
239
+ return x * x_mask
240
+
241
+
242
+ class Encoder(BaseModule):
243
+ def __init__(self, hidden_channels, filter_channels, n_heads, n_layers,
244
+ kernel_size=1, p_dropout=0.0, window_size=None, **kwargs):
245
+ super(Encoder, self).__init__()
246
+ self.hidden_channels = hidden_channels
247
+ self.filter_channels = filter_channels
248
+ self.n_heads = n_heads
249
+ self.n_layers = n_layers
250
+ self.kernel_size = kernel_size
251
+ self.p_dropout = p_dropout
252
+ self.window_size = window_size
253
+
254
+ self.drop = torch.nn.Dropout(p_dropout)
255
+ self.attn_layers = torch.nn.ModuleList()
256
+ self.norm_layers_1 = torch.nn.ModuleList()
257
+ self.ffn_layers = torch.nn.ModuleList()
258
+ self.norm_layers_2 = torch.nn.ModuleList()
259
+ for _ in range(self.n_layers):
260
+ self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels,
261
+ n_heads, window_size=window_size, p_dropout=p_dropout))
262
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
263
+ self.ffn_layers.append(FFN(hidden_channels, hidden_channels,
264
+ filter_channels, kernel_size, p_dropout=p_dropout))
265
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
266
+
267
+ def forward(self, x, x_mask):
268
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
269
+ for i in range(self.n_layers):
270
+ x = x * x_mask
271
+ y = self.attn_layers[i](x, x, attn_mask)
272
+ y = self.drop(y)
273
+ x = self.norm_layers_1[i](x + y)
274
+ y = self.ffn_layers[i](x, x_mask)
275
+ y = self.drop(y)
276
+ x = self.norm_layers_2[i](x + y)
277
+ x = x * x_mask
278
+ return x
279
+
280
+
281
+ class TextEncoder(BaseModule):
282
+ def __init__(self, n_vocab, n_feats, n_channels, filter_channels,
283
+ filter_channels_dp, n_heads, n_layers, kernel_size,
284
+ p_dropout, window_size=None, spk_emb_dim=64, n_spks=1):
285
+ super(TextEncoder, self).__init__()
286
+ self.n_vocab = n_vocab
287
+ self.n_feats = n_feats
288
+ self.n_channels = n_channels
289
+ self.filter_channels = filter_channels
290
+ self.filter_channels_dp = filter_channels_dp
291
+ self.n_heads = n_heads
292
+ self.n_layers = n_layers
293
+ self.kernel_size = kernel_size
294
+ self.p_dropout = p_dropout
295
+ self.window_size = window_size
296
+ self.spk_emb_dim = spk_emb_dim
297
+ self.n_spks = n_spks
298
+
299
+ self.emb = torch.nn.Embedding(n_vocab, n_channels)
300
+ torch.nn.init.normal_(self.emb.weight, 0.0, n_channels**-0.5)
301
+
302
+ self.prenet = ConvReluNorm(n_channels, n_channels, n_channels,
303
+ kernel_size=5, n_layers=3, p_dropout=0.5)
304
+
305
+ self.encoder = Encoder(n_channels + (spk_emb_dim if n_spks > 1 else 0), filter_channels, n_heads, n_layers,
306
+ kernel_size, p_dropout, window_size=window_size)
307
+
308
+ self.proj_m = torch.nn.Conv1d(n_channels + (spk_emb_dim if n_spks > 1 else 0), n_feats, 1)
309
+ self.proj_w = DurationPredictor(n_channels + (spk_emb_dim if n_spks > 1 else 0), filter_channels_dp,
310
+ kernel_size, p_dropout)
311
+
312
+ def forward(self, x, x_lengths, spk=None):
313
+ x = self.emb(x) * math.sqrt(self.n_channels)
314
+ x = torch.transpose(x, 1, -1)
315
+ x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
316
+
317
+ x = self.prenet(x, x_mask)
318
+ if self.n_spks > 1:
319
+ x = torch.cat([x, spk.unsqueeze(-1).repeat(1, 1, x.shape[-1])], dim=1)
320
+ x = self.encoder(x, x_mask)
321
+ mu = self.proj_m(x) * x_mask
322
+
323
+ x_dp = torch.detach(x)
324
+ logw = self.proj_w(x_dp, x_mask)
325
+
326
+ return mu, logw, x_mask
model/tts.py ADDED
@@ -0,0 +1,558 @@
1
+ import math
2
+ import random
3
+
4
+ import torch
5
+
6
+ from model import monotonic_align
7
+ from model.base import BaseModule
8
+ from model.text_encoder import TextEncoder
9
+ from model.diffusion import Diffusion
10
+ from model.utils import sequence_mask, generate_path, duration_loss, fix_len_compatibility
11
+
12
+
13
+ class GradTTSWithEmo(BaseModule):
14
+ def __init__(self, n_vocab=148, n_spks=1,n_emos=5, spk_emb_dim=64,
15
+ n_enc_channels=192, filter_channels=768, filter_channels_dp=256,
16
+ n_heads=2, n_enc_layers=6, enc_kernel=3, enc_dropout=0.1, window_size=4,
17
+ n_feats=80, dec_dim=64, beta_min=0.05, beta_max=20.0, pe_scale=1000,
18
+ use_classifier_free=False, dummy_spk_rate=0.5,
19
+ **kwargs):
20
+ super(GradTTSWithEmo, self).__init__()
21
+ self.n_vocab = n_vocab
22
+ self.n_spks = n_spks
23
+ self.n_emos = n_emos
24
+ self.spk_emb_dim = spk_emb_dim
25
+ self.n_enc_channels = n_enc_channels
26
+ self.filter_channels = filter_channels
27
+ self.filter_channels_dp = filter_channels_dp
28
+ self.n_heads = n_heads
29
+ self.n_enc_layers = n_enc_layers
30
+ self.enc_kernel = enc_kernel
31
+ self.enc_dropout = enc_dropout
32
+ self.window_size = window_size
33
+ self.n_feats = n_feats
34
+ self.dec_dim = dec_dim
35
+ self.beta_min = beta_min
36
+ self.beta_max = beta_max
37
+ self.pe_scale = pe_scale
38
+ self.use_classifier_free = use_classifier_free
39
+
40
+ # if n_spks > 1:
41
+ self.spk_emb = torch.nn.Embedding(n_spks, spk_emb_dim)
42
+ self.emo_emb = torch.nn.Embedding(n_emos, spk_emb_dim)
43
+ self.merge_spk_emo = torch.nn.Sequential(
44
+ torch.nn.Linear(spk_emb_dim*2, spk_emb_dim),
45
+ torch.nn.ReLU(),
46
+ torch.nn.Linear(spk_emb_dim, spk_emb_dim)
47
+ )
48
+ self.encoder = TextEncoder(n_vocab, n_feats, n_enc_channels,
49
+ filter_channels, filter_channels_dp, n_heads,
50
+ n_enc_layers, enc_kernel, enc_dropout, window_size,
51
+ spk_emb_dim=spk_emb_dim, n_spks=n_spks)
52
+ self.decoder = Diffusion(n_feats, dec_dim, spk_emb_dim, beta_min, beta_max, pe_scale)
53
+
54
+ if self.use_classifier_free:
55
+ self.dummy_xv = torch.nn.Parameter(torch.randn(size=(spk_emb_dim, )))
56
+ self.dummy_rate = dummy_spk_rate
57
+ print(f"Using classifier free with rate {self.dummy_rate}")
58
+
59
+ @torch.no_grad()
60
+ def forward(self, x, x_lengths, n_timesteps, temperature=1.0, stoc=False, spk=None, emo=None,
61
+ length_scale=1.0, classifier_free_guidance=1., force_dur=None):
62
+ """
63
+ Generates mel-spectrogram from text. Returns:
64
+ 1. encoder outputs
65
+ 2. decoder outputs
66
+ 3. generated alignment
67
+
68
+ Args:
69
+ x (torch.Tensor): batch of texts, converted to a tensor with phoneme embedding ids.
70
+ x_lengths (torch.Tensor): lengths of texts in batch.
71
+ n_timesteps (int): number of steps to use for reverse diffusion in decoder.
72
+ temperature (float, optional): controls variance of terminal distribution.
73
+ stoc (bool, optional): flag that adds stochastic term to the decoder sampler.
74
+ Usually, does not provide synthesis improvements.
75
+ length_scale (float, optional): controls speech pace.
76
+ Increase value to slow down generated speech and vice versa.
77
+ """
78
+ x, x_lengths = self.relocate_input([x, x_lengths])
79
+
80
+ # Get speaker embedding
81
+ spk = self.spk_emb(spk)
82
+ emo = self.emo_emb(emo)
83
+
84
+ if self.use_classifier_free:
85
+ emo = emo / torch.sqrt(torch.sum(emo**2, dim=1, keepdim=True)) # unit norm
86
+
87
+ spk_merged = self.merge_spk_emo(torch.cat([spk, emo], dim=-1))
88
+
89
+ # Get encoder_outputs `mu_x` and log-scaled token durations `logw`
90
+ mu_x, logw, x_mask = self.encoder(x, x_lengths, spk_merged)
91
+
92
+ w = torch.exp(logw) * x_mask
93
+ w_ceil = torch.ceil(w) * length_scale
94
+ if force_dur is not None:
95
+ w_ceil = force_dur.unsqueeze(1) # [1, 1, Ltext]
96
+ y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
97
+ y_max_length = int(y_lengths.max())
98
+ y_max_length_ = fix_len_compatibility(y_max_length)
99
+
100
+ # Using obtained durations `w` construct alignment map `attn`
101
+ y_mask = sequence_mask(y_lengths, y_max_length_).unsqueeze(1).to(x_mask.dtype)
102
+ attn_mask = x_mask.unsqueeze(-1) * y_mask.unsqueeze(2)
103
+ attn = generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1)
104
+
105
+ # Align encoded text and get mu_y
106
+ mu_y = torch.matmul(attn.squeeze(1).transpose(1, 2), mu_x.transpose(1, 2))
107
+ mu_y = mu_y.transpose(1, 2)
108
+ encoder_outputs = mu_y[:, :, :y_max_length]
109
+
110
+ # Sample latent representation from terminal distribution N(mu_y, I)
111
+ z = mu_y + torch.randn_like(mu_y, device=mu_y.device) / temperature
112
+ # print(z)
113
+ # Generate sample by performing reverse dynamics
114
+
115
+ unit_dummy_emo = self.dummy_xv / torch.sqrt(torch.sum(self.dummy_xv**2)) if self.use_classifier_free else None
116
+ dummy_spk = self.merge_spk_emo(torch.cat([spk, unit_dummy_emo.unsqueeze(0).repeat(len(spk), 1)], dim=-1)) if self.use_classifier_free else None
117
+
118
+ decoder_outputs = self.decoder(z, y_mask, mu_y, n_timesteps, stoc, spk_merged,
119
+ use_classifier_free=self.use_classifier_free,
120
+ classifier_free_guidance=classifier_free_guidance,
121
+ dummy_spk=dummy_spk)
122
+ decoder_outputs = decoder_outputs[:, :, :y_max_length]
123
+
124
+ return encoder_outputs, decoder_outputs, attn[:, :, :y_max_length]
125
+
126
+ def classifier_guidance_decode(self, x, x_lengths, n_timesteps, temperature=1.0, stoc=False, spk=None, emo=None,
127
+ length_scale=1.0, classifier_func=None, guidance=1.0, classifier_type='conformer'):
128
+ x, x_lengths = self.relocate_input([x, x_lengths])
129
+
130
+ # Get speaker embedding
131
+ spk = self.spk_emb(spk)
132
+ dummy_emo = self.emo_emb(torch.zeros_like(emo).long()) # this is for feeding the text encoder.
133
+
134
+ spk_merged = self.merge_spk_emo(torch.cat([spk, dummy_emo], dim=-1))
135
+
136
+ # Get encoder_outputs `mu_x` and log-scaled token durations `logw`
137
+ mu_x, logw, x_mask = self.encoder(x, x_lengths, spk_merged)
138
+
139
+ w = torch.exp(logw) * x_mask
140
+ # print("w shape is ", w.shape)
141
+ w_ceil = torch.ceil(w) * length_scale
142
+ y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
143
+ y_max_length = int(y_lengths.max())
144
+ if classifier_type in ('CNN', 'CNN-with-time'):
145
+ y_max_length = max(y_max_length, 180) # NOTE: added for CNN classifier
146
+ y_max_length_ = fix_len_compatibility(y_max_length)
147
+
148
+ # Using obtained durations `w` construct alignment map `attn`
149
+ y_mask = sequence_mask(y_lengths, y_max_length_).unsqueeze(1).to(x_mask.dtype)
150
+ attn_mask = x_mask.unsqueeze(-1) * y_mask.unsqueeze(2)
151
+ attn = generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1)
152
+
153
+ # Align encoded text and get mu_y
154
+ mu_y = torch.matmul(attn.squeeze(1).transpose(1, 2), mu_x.transpose(1, 2))
155
+ mu_y = mu_y.transpose(1, 2)
156
+ encoder_outputs = mu_y[:, :, :y_max_length]
157
+
158
+ # Sample latent representation from terminal distribution N(mu_y, I)
159
+ z = mu_y + torch.randn_like(mu_y, device=mu_y.device) / temperature
160
+ # Generate sample by performing reverse dynamics
161
+
162
+ decoder_outputs = self.decoder.classifier_decode(z, y_mask, mu_y, n_timesteps, stoc, spk_merged,
163
+ classifier_func, guidance,
164
+ control_emo=emo, classifier_type=classifier_type)
165
+ decoder_outputs = decoder_outputs[:, :, :y_max_length]
166
+ return encoder_outputs, decoder_outputs, attn[:, :, :y_max_length]
167
+
168
+ def classifier_guidance_decode_DPS(self, x, x_lengths, n_timesteps, temperature=1.0, stoc=False, spk=None, emo=None,
169
+ length_scale=1.0, classifier_func=None, guidance=1.0, classifier_type='conformer'):
170
+ x, x_lengths = self.relocate_input([x, x_lengths])
171
+
172
+ # Get speaker embedding
173
+ spk = self.spk_emb(spk)
174
+ dummy_emo = self.emo_emb(torch.zeros_like(emo).long()) # this is for feeding the text encoder.
175
+
176
+ spk_merged = self.merge_spk_emo(torch.cat([spk, dummy_emo], dim=-1))
177
+
178
+ # Get encoder_outputs `mu_x` and log-scaled token durations `logw`
179
+ mu_x, logw, x_mask = self.encoder(x, x_lengths, spk_merged)
180
+
181
+ w = torch.exp(logw) * x_mask
182
+ w_ceil = torch.ceil(w) * length_scale
183
+ y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
184
+ y_max_length = int(y_lengths.max())
185
+ if classifier_type in ('CNN', 'CNN-with-time'):
186
+ y_max_length = max(y_max_length, 180) # NOTE: added for CNN classifier
187
+ y_max_length_ = fix_len_compatibility(y_max_length)
188
+
189
+ # Using obtained durations `w` construct alignment map `attn`
190
+ y_mask = sequence_mask(y_lengths, y_max_length_).unsqueeze(1).to(x_mask.dtype)
191
+ attn_mask = x_mask.unsqueeze(-1) * y_mask.unsqueeze(2)
192
+ attn = generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1)
193
+
194
+ # Align encoded text and get mu_y
195
+ mu_y = torch.matmul(attn.squeeze(1).transpose(1, 2), mu_x.transpose(1, 2))
196
+ mu_y = mu_y.transpose(1, 2)
197
+ encoder_outputs = mu_y[:, :, :y_max_length]
198
+
199
+ # Sample latent representation from terminal distribution N(mu_y, I)
200
+ z = mu_y + torch.randn_like(mu_y, device=mu_y.device) / temperature
201
+ # Generate sample by performing reverse dynamics
202
+
203
+ decoder_outputs = self.decoder.classifier_decode_DPS(z, y_mask, mu_y, n_timesteps, stoc, spk_merged,
204
+ classifier_func, guidance,
205
+ control_emo=emo, classifier_type=classifier_type)
206
+ decoder_outputs = decoder_outputs[:, :, :y_max_length]
207
+ return encoder_outputs, decoder_outputs, attn[:, :, :y_max_length]
208
+
209
+ def classifier_guidance_decode_two_mixture(self, x, x_lengths, n_timesteps, temperature=1.0, stoc=False, spk=None, emo1=None, emo2=None, emo1_weight=None,
210
+ length_scale=1.0, classifier_func=None, guidance=1.0, classifier_type='conformer'):
211
+ x, x_lengths = self.relocate_input([x, x_lengths])
212
+
213
+ # Get speaker embedding
214
+ spk = self.spk_emb(spk)
215
+ dummy_emo = self.emo_emb(torch.zeros_like(emo1).long()) # this is for feeding the text encoder.
216
+
217
+ spk_merged = self.merge_spk_emo(torch.cat([spk, dummy_emo], dim=-1))
218
+
219
+ # Get encoder_outputs `mu_x` and log-scaled token durations `logw`
220
+ mu_x, logw, x_mask = self.encoder(x, x_lengths, spk_merged)
221
+
222
+ w = torch.exp(logw) * x_mask
223
+ w_ceil = torch.ceil(w) * length_scale
224
+ y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
225
+ y_max_length = int(y_lengths.max())
226
+ if classifier_type in ('CNN', 'CNN-with-time'):
227
+ y_max_length = max(y_max_length, 180) # NOTE: added for CNN classifier
228
+ y_max_length_ = fix_len_compatibility(y_max_length)
229
+
230
+ # Using obtained durations `w` construct alignment map `attn`
231
+ y_mask = sequence_mask(y_lengths, y_max_length_).unsqueeze(1).to(x_mask.dtype)
232
+ attn_mask = x_mask.unsqueeze(-1) * y_mask.unsqueeze(2)
233
+ attn = generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1)
234
+
235
+ # Align encoded text and get mu_y
236
+ mu_y = torch.matmul(attn.squeeze(1).transpose(1, 2), mu_x.transpose(1, 2))
237
+ mu_y = mu_y.transpose(1, 2)
238
+ encoder_outputs = mu_y[:, :, :y_max_length]
239
+
240
+ # Sample latent representation from terminal distribution N(mu_y, I)
241
+ z = mu_y + torch.randn_like(mu_y, device=mu_y.device) / temperature
242
+ # Generate sample by performing reverse dynamics
243
+
244
+ decoder_outputs = self.decoder.classifier_decode_mixture(z, y_mask, mu_y, n_timesteps, stoc, spk_merged,
245
+ classifier_func, guidance,
246
+ control_emo1=emo1, control_emo2=emo2, emo1_weight=emo1_weight, classifier_type=classifier_type)
247
+ decoder_outputs = decoder_outputs[:, :, :y_max_length]
248
+ return encoder_outputs, decoder_outputs, attn[:, :, :y_max_length]
249
+
250
+ def classifier_guidance_decode_two_mixture_DPS(self, x, x_lengths, n_timesteps, temperature=1.0, stoc=False, spk=None, emo1=None, emo2=None, emo1_weight=None,
251
+ length_scale=1.0, classifier_func=None, guidance=1.0, classifier_type='conformer'):
252
+ x, x_lengths = self.relocate_input([x, x_lengths])
253
+
254
+ # Get speaker embedding
255
+ spk = self.spk_emb(spk)
256
+ dummy_emo = self.emo_emb(torch.zeros_like(emo1).long()) # this is for feeding the text encoder.
257
+
258
+ spk_merged = self.merge_spk_emo(torch.cat([spk, dummy_emo], dim=-1))
259
+
260
+ # Get encoder_outputs `mu_x` and log-scaled token durations `logw`
261
+ mu_x, logw, x_mask = self.encoder(x, x_lengths, spk_merged)
262
+
263
+ w = torch.exp(logw) * x_mask
264
+ w_ceil = torch.ceil(w) * length_scale
265
+ y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
266
+ y_max_length = int(y_lengths.max())
267
+ if classifier_type in ('CNN', 'CNN-with-time'):
268
+ y_max_length = max(y_max_length, 180) # NOTE: added for CNN classifier
269
+ y_max_length_ = fix_len_compatibility(y_max_length)
270
+
271
+ # Using obtained durations `w` construct alignment map `attn`
272
+ y_mask = sequence_mask(y_lengths, y_max_length_).unsqueeze(1).to(x_mask.dtype)
273
+ attn_mask = x_mask.unsqueeze(-1) * y_mask.unsqueeze(2)
274
+ attn = generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1)
275
+
276
+ # Align encoded text and get mu_y
277
+ mu_y = torch.matmul(attn.squeeze(1).transpose(1, 2), mu_x.transpose(1, 2))
278
+ mu_y = mu_y.transpose(1, 2)
279
+ encoder_outputs = mu_y[:, :, :y_max_length]
280
+
281
+ # Sample latent representation from terminal distribution N(mu_y, I)
282
+ z = mu_y + torch.randn_like(mu_y, device=mu_y.device) / temperature
283
+ # Generate sample by performing reverse dynamics
284
+
285
+ decoder_outputs = self.decoder.classifier_decode_mixture_DPS(z, y_mask, mu_y, n_timesteps, stoc, spk_merged,
286
+ classifier_func, guidance,
287
+ control_emo1=emo1, control_emo2=emo2, emo1_weight=emo1_weight, classifier_type=classifier_type)
288
+ decoder_outputs = decoder_outputs[:, :, :y_max_length]
289
+ return encoder_outputs, decoder_outputs, attn[:, :, :y_max_length]
290
+
291
+ def compute_loss(self, x, x_lengths, y, y_lengths, spk=None, emo=None, out_size=None, use_gt_dur=False, durs=None):
292
+ """
293
+ Computes 3 losses:
294
+ 1. duration loss: loss between predicted token durations and those extracted by Monotonic Alignment Search (MAS).
295
+ 2. prior loss: loss between mel-spectrogram and encoder outputs.
296
+ 3. diffusion loss: loss between gaussian noise and its reconstruction by diffusion-based decoder.
297
+
298
+ Args:
299
+ x (torch.Tensor): batch of texts, converted to a tensor with phoneme embedding ids.
300
+ x_lengths (torch.Tensor): lengths of texts in batch.
301
+ y (torch.Tensor): batch of corresponding mel-spectrograms.
302
+ y_lengths (torch.Tensor): lengths of mel-spectrograms in batch.
303
+ out_size (int, optional): length (in mel's sampling rate) of segment to cut, on which decoder will be trained.
304
+ Should be divisible by 2^{num of UNet downsamplings}. Needed to increase batch size.
305
+ use_gt_dur (bool): if True, use the ground-truth durations `durs` instead of the MAS alignment.
306
+ durs (torch.Tensor): ground-truth durations, required when `use_gt_dur` is True.
307
+ """
308
+ x, x_lengths, y, y_lengths = self.relocate_input([x, x_lengths, y, y_lengths]) # y: B, 80, L
309
+
310
+ spk = self.spk_emb(spk)
311
+ emo = self.emo_emb(emo) # [B, D]
312
+ if self.use_classifier_free:
313
+ emo = emo / torch.sqrt(torch.sum(emo ** 2, dim=1, keepdim=True)) # unit norm
314
+ use_dummy_per_sample = torch.distributions.Binomial(1, torch.tensor(
315
+ [self.dummy_rate] * len(emo))).sample().bool() # [b, ] True/False where True accords to rate
316
+ emo[use_dummy_per_sample] = (self.dummy_xv / torch.sqrt(
317
+ torch.sum(self.dummy_xv ** 2))) # substitute with dummy xv(unit norm too)
318
+
319
+ spk = self.merge_spk_emo(torch.cat([spk, emo], dim=-1)) # [B, D]
320
+
321
+ # Get encoder_outputs `mu_x` and log-scaled token durations `logw`
322
+ mu_x, logw, x_mask = self.encoder(x, x_lengths, spk)
323
+ y_max_length = y.shape[-1]
324
+
325
+ y_mask = sequence_mask(y_lengths, y_max_length).unsqueeze(1).to(x_mask)
326
+ attn_mask = x_mask.unsqueeze(-1) * y_mask.unsqueeze(2)
327
+
328
+ # Use MAS to find most likely alignment `attn` between text and mel-spectrogram
329
+ if use_gt_dur:
330
+ attn = generate_path(durs, attn_mask.squeeze(1)).detach()
331
+ else:
332
+ with torch.no_grad():
333
+ const = -0.5 * math.log(2 * math.pi) * self.n_feats
334
+ factor = -0.5 * torch.ones(mu_x.shape, dtype=mu_x.dtype, device=mu_x.device)
335
+ y_square = torch.matmul(factor.transpose(1, 2), y ** 2)
336
+ y_mu_double = torch.matmul(2.0 * (factor * mu_x).transpose(1, 2), y)
337
+ mu_square = torch.sum(factor * (mu_x ** 2), 1).unsqueeze(-1)
338
+ log_prior = y_square - y_mu_double + mu_square + const
339
+ # it's actually the log likelihood of y given the Gaussian with (mu_x, I)
340
+
341
+ attn = monotonic_align.maximum_path(log_prior, attn_mask.squeeze(1))
342
+ attn = attn.detach()
343
+
344
+ # Compute loss between predicted log-scaled durations and those obtained from MAS
345
+ logw_ = torch.log(1e-8 + torch.sum(attn.unsqueeze(1), -1)) * x_mask
346
+ dur_loss = duration_loss(logw, logw_, x_lengths)
347
+ # print(attn.shape)
348
+
349
+ # Cut a small segment of mel-spectrogram in order to increase batch size
350
+ if out_size is not None:
351
+ clip_size = min(out_size, y_max_length) # when out_size > max length, do not actually perform clipping
352
+ clip_size = -fix_len_compatibility(-clip_size) # this is to ensure dividable
353
+ max_offset = (y_lengths - clip_size).clamp(0)
354
+ offset_ranges = list(zip([0] * max_offset.shape[0], max_offset.cpu().numpy()))
355
+ out_offset = torch.LongTensor([
356
+ torch.tensor(random.choice(range(start, end)) if end > start else 0)
357
+ for start, end in offset_ranges
358
+ ]).to(y_lengths)
359
+
360
+ attn_cut = torch.zeros(attn.shape[0], attn.shape[1], clip_size, dtype=attn.dtype, device=attn.device)
361
+ y_cut = torch.zeros(y.shape[0], self.n_feats, clip_size, dtype=y.dtype, device=y.device)
362
+ y_cut_lengths = []
363
+ for i, (y_, out_offset_) in enumerate(zip(y, out_offset)):
364
+ y_cut_length = clip_size + (y_lengths[i] - clip_size).clamp(None, 0)
365
+ y_cut_lengths.append(y_cut_length)
366
+ cut_lower, cut_upper = out_offset_, out_offset_ + y_cut_length
367
+ y_cut[i, :, :y_cut_length] = y_[:, cut_lower:cut_upper]
368
+ attn_cut[i, :, :y_cut_length] = attn[i, :, cut_lower:cut_upper]
369
+ y_cut_lengths = torch.LongTensor(y_cut_lengths)
370
+ y_cut_mask = sequence_mask(y_cut_lengths).unsqueeze(1).to(y_mask)
371
+
372
+ attn = attn_cut # attn -> [B, text_length, cut_length]. It does not begin from top left corner
373
+ y = y_cut
374
+ y_mask = y_cut_mask
375
+
376
+ # Align encoded text with mel-spectrogram and get mu_y segment
377
+ mu_y = torch.matmul(attn.squeeze(1).transpose(1, 2), mu_x.transpose(1, 2)) # here mu_x is not cut.
378
+ mu_y = mu_y.transpose(1, 2) # B, 80, cut_length
379
+
380
+ # Compute loss of score-based decoder
381
+ # print(y.shape, y_mask.shape, mu_y.shape)
382
+ diff_loss, xt = self.decoder.compute_loss(y, y_mask, mu_y, spk)
383
+
384
+ # Compute loss between aligned encoder outputs and mel-spectrogram
385
+ prior_loss = torch.sum(0.5 * ((y - mu_y) ** 2 + math.log(2 * math.pi)) * y_mask)
386
+ prior_loss = prior_loss / (torch.sum(y_mask) * self.n_feats)
387
+
388
+ return dur_loss, prior_loss, diff_loss
389
+
390
+
391
+ class GradTTSXvector(BaseModule):
392
+ def __init__(self, n_vocab=148, spk_emb_dim=64,
393
+ n_enc_channels=192, filter_channels=768, filter_channels_dp=256,
394
+ n_heads=2, n_enc_layers=6, enc_kernel=3, enc_dropout=0.1, window_size=4,
395
+ n_feats=80, dec_dim=64, beta_min=0.05, beta_max=20.0, pe_scale=1000, xvector_dim=512, **kwargs):
396
+ super(GradTTSXvector, self).__init__()
397
+ self.n_vocab = n_vocab
398
+ # self.n_spks = n_spks
399
+ self.spk_emb_dim = spk_emb_dim
400
+ self.n_enc_channels = n_enc_channels
401
+ self.filter_channels = filter_channels
402
+ self.filter_channels_dp = filter_channels_dp
403
+ self.n_heads = n_heads
404
+ self.n_enc_layers = n_enc_layers
405
+ self.enc_kernel = enc_kernel
406
+ self.enc_dropout = enc_dropout
407
+ self.window_size = window_size
408
+ self.n_feats = n_feats
409
+ self.dec_dim = dec_dim
410
+ self.beta_min = beta_min
411
+ self.beta_max = beta_max
412
+ self.pe_scale = pe_scale
413
+
414
+ self.xvector_proj = torch.nn.Linear(xvector_dim, spk_emb_dim)
415
+ self.encoder = TextEncoder(n_vocab, n_feats, n_enc_channels,
416
+ filter_channels, filter_channels_dp, n_heads,
417
+ n_enc_layers, enc_kernel, enc_dropout, window_size,
418
+ spk_emb_dim=spk_emb_dim, n_spks=999) # NOTE: not important `n_spk`
419
+ self.decoder = Diffusion(n_feats, dec_dim, spk_emb_dim, beta_min, beta_max, pe_scale)
420
+
421
+ @torch.no_grad()
422
+ def forward(self, x, x_lengths, n_timesteps, temperature=1.0, stoc=False, spk=None, length_scale=1.0):
423
+ """
424
+ Generates mel-spectrogram from text. Returns:
425
+ 1. encoder outputs
426
+ 2. decoder outputs
427
+ 3. generated alignment
428
+
429
+ Args:
430
+ x (torch.Tensor): batch of texts, converted to a tensor with phoneme embedding ids.
431
+ x_lengths (torch.Tensor): lengths of texts in batch.
432
+ n_timesteps (int): number of steps to use for reverse diffusion in decoder.
433
+ temperature (float, optional): controls variance of terminal distribution.
434
+ stoc (bool, optional): flag that adds stochastic term to the decoder sampler.
435
+ Usually, does not provide synthesis improvements.
436
+ length_scale (float, optional): controls speech pace.
437
+ Increase value to slow down generated speech and vice versa.
438
+ spk (torch.Tensor): speaker x-vectors, projected to the speaker-embedding dimension instead of a lookup table.
439
+ """
440
+ x, x_lengths = self.relocate_input([x, x_lengths])
441
+
442
+ spk = self.xvector_proj(spk) # NOTE: use x-vectors instead of speaker embedding
443
+
444
+ # Get encoder_outputs `mu_x` and log-scaled token durations `logw`
445
+ mu_x, logw, x_mask = self.encoder(x, x_lengths, spk)
446
+
447
+ w = torch.exp(logw) * x_mask
448
+ w_ceil = torch.ceil(w) * length_scale
449
+ y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
450
+ y_max_length = int(y_lengths.max())
451
+ y_max_length_ = fix_len_compatibility(y_max_length)
452
+
453
+ # Using obtained durations `w` construct alignment map `attn`
454
+ y_mask = sequence_mask(y_lengths, y_max_length_).unsqueeze(1).to(x_mask.dtype)
455
+ attn_mask = x_mask.unsqueeze(-1) * y_mask.unsqueeze(2)
456
+ attn = generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1)
457
+
458
+ # Align encoded text and get mu_y
459
+ mu_y = torch.matmul(attn.squeeze(1).transpose(1, 2), mu_x.transpose(1, 2))
460
+ mu_y = mu_y.transpose(1, 2)
461
+ encoder_outputs = mu_y[:, :, :y_max_length]
462
+
463
+ # Sample latent representation from terminal distribution N(mu_y, I)
464
+ z = mu_y + torch.randn_like(mu_y, device=mu_y.device) / temperature
465
+ # Generate sample by performing reverse dynamics
466
+ decoder_outputs = self.decoder(z, y_mask, mu_y, n_timesteps, stoc, spk)
467
+ decoder_outputs = decoder_outputs[:, :, :y_max_length]
468
+
469
+ return encoder_outputs, decoder_outputs, attn[:, :, :y_max_length]
470
+
471
+ def compute_loss(self, x, x_lengths, y, y_lengths, spk=None, out_size=None, use_gt_dur=False, durs=None):
472
+ """
473
+ Computes 3 losses:
474
+ 1. duration loss: loss between predicted token durations and those extracted by Monotonic Alignment Search (MAS).
475
+ 2. prior loss: loss between mel-spectrogram and encoder outputs.
476
+ 3. diffusion loss: loss between gaussian noise and its reconstruction by diffusion-based decoder.
477
+
478
+ Args:
479
+ x (torch.Tensor): batch of texts, converted to a tensor with phoneme embedding ids.
480
+ x_lengths (torch.Tensor): lengths of texts in batch.
481
+ y (torch.Tensor): batch of corresponding mel-spectrograms.
482
+ y_lengths (torch.Tensor): lengths of mel-spectrograms in batch.
483
+ out_size (int, optional): length (in mel's sampling rate) of segment to cut, on which decoder will be trained.
484
+ Should be divisible by 2^{num of UNet downsamplings}. Needed to increase batch size.
485
+ spk (torch.Tensor): speaker x-vectors.
486
+ use_gt_dur (bool): if True, use the ground-truth durations `durs` instead of the MAS alignment.
487
+ durs (torch.Tensor): ground-truth durations, required when `use_gt_dur` is True.
488
+ """
489
+ x, x_lengths, y, y_lengths = self.relocate_input([x, x_lengths, y, y_lengths])
490
+
491
+ spk = self.xvector_proj(spk) # NOTE: use x-vectors instead of speaker embedding
492
+
493
+ # Get encoder_outputs `mu_x` and log-scaled token durations `logw`
494
+ mu_x, logw, x_mask = self.encoder(x, x_lengths, spk)
495
+ y_max_length = y.shape[-1]
496
+
497
+ y_mask = sequence_mask(y_lengths, y_max_length).unsqueeze(1).to(x_mask)
498
+ attn_mask = x_mask.unsqueeze(-1) * y_mask.unsqueeze(2)
499
+
500
+ # Use MAS to find most likely alignment `attn` between text and mel-spectrogram
501
+ if not use_gt_dur:
502
+ with torch.no_grad():
503
+ const = -0.5 * math.log(2 * math.pi) * self.n_feats
504
+ factor = -0.5 * torch.ones(mu_x.shape, dtype=mu_x.dtype, device=mu_x.device)
505
+ y_square = torch.matmul(factor.transpose(1, 2), y ** 2)
506
+ y_mu_double = torch.matmul(2.0 * (factor * mu_x).transpose(1, 2), y)
507
+ mu_square = torch.sum(factor * (mu_x ** 2), 1).unsqueeze(-1)
508
+ log_prior = y_square - y_mu_double + mu_square + const
509
+
510
+ attn = monotonic_align.maximum_path(log_prior, attn_mask.squeeze(1))
511
+ attn = attn.detach()
512
+ else:
513
+ with torch.no_grad():
514
+ attn = generate_path(durs, attn_mask.squeeze(1)).detach()
515
+
516
+ # Compute loss between predicted log-scaled durations and those obtained from MAS
517
+ logw_ = torch.log(1e-8 + torch.sum(attn.unsqueeze(1), -1)) * x_mask
518
+ dur_loss = duration_loss(logw, logw_, x_lengths)
519
+
520
+ # print(attn.shape)
521
+
522
+ # Cut a small segment of mel-spectrogram in order to increase batch size
523
+ if out_size is not None:
524
+ max_offset = (y_lengths - out_size).clamp(0)
525
+ offset_ranges = list(zip([0] * max_offset.shape[0], max_offset.cpu().numpy()))
526
+ out_offset = torch.LongTensor([
527
+ torch.tensor(random.choice(range(start, end)) if end > start else 0)
528
+ for start, end in offset_ranges
529
+ ]).to(y_lengths)
530
+
531
+ attn_cut = torch.zeros(attn.shape[0], attn.shape[1], out_size, dtype=attn.dtype, device=attn.device)
532
+ y_cut = torch.zeros(y.shape[0], self.n_feats, out_size, dtype=y.dtype, device=y.device)
533
+ y_cut_lengths = []
534
+ for i, (y_, out_offset_) in enumerate(zip(y, out_offset)):
535
+ y_cut_length = out_size + (y_lengths[i] - out_size).clamp(None, 0)
536
+ y_cut_lengths.append(y_cut_length)
537
+ cut_lower, cut_upper = out_offset_, out_offset_ + y_cut_length
538
+ y_cut[i, :, :y_cut_length] = y_[:, cut_lower:cut_upper]
539
+ attn_cut[i, :, :y_cut_length] = attn[i, :, cut_lower:cut_upper]
540
+ y_cut_lengths = torch.LongTensor(y_cut_lengths)
541
+ y_cut_mask = sequence_mask(y_cut_lengths).unsqueeze(1).to(y_mask)
542
+
543
+ attn = attn_cut
544
+ y = y_cut
545
+ y_mask = y_cut_mask
546
+
547
+ # Align encoded text with mel-spectrogram and get mu_y segment
548
+ mu_y = torch.matmul(attn.squeeze(1).transpose(1, 2), mu_x.transpose(1, 2))
549
+ mu_y = mu_y.transpose(1, 2)
550
+
551
+ # Compute loss of score-based decoder
552
+ diff_loss, xt = self.decoder.compute_loss(y, y_mask, mu_y, spk)
553
+
554
+ # Compute loss between aligned encoder outputs and mel-spectrogram
555
+ prior_loss = torch.sum(0.5 * ((y - mu_y) ** 2 + math.log(2 * math.pi)) * y_mask)
556
+ prior_loss = prior_loss / (torch.sum(y_mask) * self.n_feats)
557
+
558
+ return dur_loss, prior_loss, diff_loss
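A minimal end-to-end inference sketch for the GradTTSWithEmo class above (illustrative only, not a script shipped in this commit; the `model.tts` import path, the sample Kazakh text, the speaker/emotion ids and the randomly initialised weights are assumptions — a real run would load a trained checkpoint into `model` first and pick ids that exist in the training data):

    import torch
    from model.tts import GradTTSWithEmo
    from text import convert_text

    model = GradTTSWithEmo(n_vocab=148, n_spks=1, n_emos=5).eval()

    # phoneme/character ids padded to [1, T_text] plus the length tensor
    x, x_lengths = convert_text("Сәлем, әлем!")
    spk = torch.LongTensor([0])   # speaker id
    emo = torch.LongTensor([2])   # emotion id

    with torch.no_grad():
        mu_y, mel, attn = model(x, x_lengths, n_timesteps=10, temperature=1.5,
                                stoc=False, spk=spk, emo=emo, length_scale=1.0)
    print(mel.shape)   # [1, 80, T_mel] — the mel-spectrogram fed to the vocoder in models.py

The returned tuple matches the docstring of `forward`: the aligned encoder outputs, the decoded mel-spectrogram, and the hard alignment between text tokens and mel frames.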
model/utils.py ADDED
@@ -0,0 +1,44 @@
1
+ """ from https://github.com/jaywalnut310/glow-tts """
2
+
3
+ import torch
4
+
5
+
6
+ def sequence_mask(length, max_length=None):
7
+ if max_length is None:
8
+ max_length = length.max()
9
+ x = torch.arange(int(max_length), dtype=length.dtype, device=length.device)
10
+ return x.unsqueeze(0) < length.unsqueeze(1)
11
+
12
+
13
+ def fix_len_compatibility(length, num_downsamplings_in_unet=2):
14
+ while True:
15
+ if length % (2**num_downsamplings_in_unet) == 0:
16
+ return length
17
+ length += 1
18
+
19
+
20
+ def convert_pad_shape(pad_shape):
21
+ l = pad_shape[::-1]
22
+ pad_shape = [item for sublist in l for item in sublist]
23
+ return pad_shape
24
+
25
+
26
+ def generate_path(duration, mask):
27
+ device = duration.device
28
+
29
+ b, t_x, t_y = mask.shape
30
+ cum_duration = torch.cumsum(duration, 1)
31
+ path = torch.zeros(b, t_x, t_y, dtype=mask.dtype).to(device=device)
32
+
33
+ cum_duration_flat = cum_duration.view(b * t_x)
34
+ path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
35
+ path = path.view(b, t_x, t_y)
36
+ path = path - torch.nn.functional.pad(path, convert_pad_shape([[0, 0],
37
+ [1, 0], [0, 0]]))[:, :-1]
38
+ path = path * mask
39
+ return path
40
+
41
+
42
+ def duration_loss(logw, logw_, lengths):
43
+ loss = torch.sum((logw - logw_)**2) / torch.sum(lengths)
44
+ return loss
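A small self-contained illustration of these helpers (toy tensors only): `sequence_mask` turns lengths into a boolean mask, `fix_len_compatibility` rounds a frame count up so it survives the two UNet downsamplings, and `generate_path` expands per-token durations into a hard monotonic alignment:

    import torch
    from model.utils import sequence_mask, fix_len_compatibility, generate_path

    lengths = torch.LongTensor([3, 5])
    print(sequence_mask(lengths).int())   # [[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]]

    print(fix_len_compatibility(173))     # 176, the next multiple of 2**2

    # two tokens lasting 2 and 3 frames -> token 0 covers frames 0-1, token 1 covers frames 2-4
    dur = torch.FloatTensor([[2., 3.]])
    attn_mask = torch.ones(1, 2, 5)
    print(generate_path(dur, attn_mask))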
models.py ADDED
@@ -0,0 +1,283 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import torch.nn as nn
4
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
5
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
6
+ from xutils import init_weights, get_padding
7
+
8
+ LRELU_SLOPE = 0.1
9
+
10
+
11
+ class ResBlock1(torch.nn.Module):
12
+ def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
13
+ super(ResBlock1, self).__init__()
14
+ self.h = h
15
+ self.convs1 = nn.ModuleList([
16
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
17
+ padding=get_padding(kernel_size, dilation[0]))),
18
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
19
+ padding=get_padding(kernel_size, dilation[1]))),
20
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
21
+ padding=get_padding(kernel_size, dilation[2])))
22
+ ])
23
+ self.convs1.apply(init_weights)
24
+
25
+ self.convs2 = nn.ModuleList([
26
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
27
+ padding=get_padding(kernel_size, 1))),
28
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
29
+ padding=get_padding(kernel_size, 1))),
30
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
31
+ padding=get_padding(kernel_size, 1)))
32
+ ])
33
+ self.convs2.apply(init_weights)
34
+
35
+ def forward(self, x):
36
+ for c1, c2 in zip(self.convs1, self.convs2):
37
+ xt = F.leaky_relu(x, LRELU_SLOPE)
38
+ xt = c1(xt)
39
+ xt = F.leaky_relu(xt, LRELU_SLOPE)
40
+ xt = c2(xt)
41
+ x = xt + x
42
+ return x
43
+
44
+ def remove_weight_norm(self):
45
+ for l in self.convs1:
46
+ remove_weight_norm(l)
47
+ for l in self.convs2:
48
+ remove_weight_norm(l)
49
+
50
+
51
+ class ResBlock2(torch.nn.Module):
52
+ def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
53
+ super(ResBlock2, self).__init__()
54
+ self.h = h
55
+ self.convs = nn.ModuleList([
56
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
57
+ padding=get_padding(kernel_size, dilation[0]))),
58
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
59
+ padding=get_padding(kernel_size, dilation[1])))
60
+ ])
61
+ self.convs.apply(init_weights)
62
+
63
+ def forward(self, x):
64
+ for c in self.convs:
65
+ xt = F.leaky_relu(x, LRELU_SLOPE)
66
+ xt = c(xt)
67
+ x = xt + x
68
+ return x
69
+
70
+ def remove_weight_norm(self):
71
+ for l in self.convs:
72
+ remove_weight_norm(l)
73
+
74
+
75
+ class Generator(torch.nn.Module):
76
+ def __init__(self, h):
77
+ super(Generator, self).__init__()
78
+ self.h = h
79
+ self.num_kernels = len(h.resblock_kernel_sizes)
80
+ self.num_upsamples = len(h.upsample_rates)
81
+ self.conv_pre = weight_norm(Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3))
82
+ resblock = ResBlock1 if h.resblock == '1' else ResBlock2
83
+
84
+ self.ups = nn.ModuleList()
85
+ for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
86
+ self.ups.append(weight_norm(
87
+ ConvTranspose1d(h.upsample_initial_channel//(2**i), h.upsample_initial_channel//(2**(i+1)),
88
+ k, u, padding=(k-u)//2)))
89
+
90
+ self.resblocks = nn.ModuleList()
91
+ for i in range(len(self.ups)):
92
+ ch = h.upsample_initial_channel//(2**(i+1))
93
+ for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
94
+ self.resblocks.append(resblock(h, ch, k, d))
95
+
96
+ self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
97
+ self.ups.apply(init_weights)
98
+ self.conv_post.apply(init_weights)
99
+
100
+ def forward(self, x):
101
+ x = self.conv_pre(x)
102
+ for i in range(self.num_upsamples):
103
+ x = F.leaky_relu(x, LRELU_SLOPE)
104
+ x = self.ups[i](x)
105
+ xs = None
106
+ for j in range(self.num_kernels):
107
+ if xs is None:
108
+ xs = self.resblocks[i*self.num_kernels+j](x)
109
+ else:
110
+ xs += self.resblocks[i*self.num_kernels+j](x)
111
+ x = xs / self.num_kernels
112
+ x = F.leaky_relu(x)
113
+ x = self.conv_post(x)
114
+ x = torch.tanh(x)
115
+
116
+ return x
117
+
118
+ def remove_weight_norm(self):
119
+ print('Removing weight norm...')
120
+ for l in self.ups:
121
+ remove_weight_norm(l)
122
+ for l in self.resblocks:
123
+ l.remove_weight_norm()
124
+ remove_weight_norm(self.conv_pre)
125
+ remove_weight_norm(self.conv_post)
126
+
127
+
128
+ class DiscriminatorP(torch.nn.Module):
129
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
130
+ super(DiscriminatorP, self).__init__()
131
+ self.period = period
132
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
133
+ self.convs = nn.ModuleList([
134
+ norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
135
+ norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
136
+ norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
137
+ norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
138
+ norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
139
+ ])
140
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
141
+
142
+ def forward(self, x):
143
+ fmap = []
144
+
145
+ # 1d to 2d
146
+ b, c, t = x.shape
147
+ if t % self.period != 0: # pad first
148
+ n_pad = self.period - (t % self.period)
149
+ x = F.pad(x, (0, n_pad), "reflect")
150
+ t = t + n_pad
151
+ x = x.view(b, c, t // self.period, self.period)
152
+
153
+ for l in self.convs:
154
+ x = l(x)
155
+ x = F.leaky_relu(x, LRELU_SLOPE)
156
+ fmap.append(x)
157
+ x = self.conv_post(x)
158
+ fmap.append(x)
159
+ x = torch.flatten(x, 1, -1)
160
+
161
+ return x, fmap
162
+
163
+
164
+ class MultiPeriodDiscriminator(torch.nn.Module):
165
+ def __init__(self):
166
+ super(MultiPeriodDiscriminator, self).__init__()
167
+ self.discriminators = nn.ModuleList([
168
+ DiscriminatorP(2),
169
+ DiscriminatorP(3),
170
+ DiscriminatorP(5),
171
+ DiscriminatorP(7),
172
+ DiscriminatorP(11),
173
+ ])
174
+
175
+ def forward(self, y, y_hat):
176
+ y_d_rs = []
177
+ y_d_gs = []
178
+ fmap_rs = []
179
+ fmap_gs = []
180
+ for i, d in enumerate(self.discriminators):
181
+ y_d_r, fmap_r = d(y)
182
+ y_d_g, fmap_g = d(y_hat)
183
+ y_d_rs.append(y_d_r)
184
+ fmap_rs.append(fmap_r)
185
+ y_d_gs.append(y_d_g)
186
+ fmap_gs.append(fmap_g)
187
+
188
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
189
+
190
+
191
+ class DiscriminatorS(torch.nn.Module):
192
+ def __init__(self, use_spectral_norm=False):
193
+ super(DiscriminatorS, self).__init__()
194
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
195
+ self.convs = nn.ModuleList([
196
+ norm_f(Conv1d(1, 128, 15, 1, padding=7)),
197
+ norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
198
+ norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
199
+ norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
200
+ norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
201
+ norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
202
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
203
+ ])
204
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
205
+
206
+ def forward(self, x):
207
+ fmap = []
208
+ for l in self.convs:
209
+ x = l(x)
210
+ x = F.leaky_relu(x, LRELU_SLOPE)
211
+ fmap.append(x)
212
+ x = self.conv_post(x)
213
+ fmap.append(x)
214
+ x = torch.flatten(x, 1, -1)
215
+
216
+ return x, fmap
217
+
218
+
219
+ class MultiScaleDiscriminator(torch.nn.Module):
220
+ def __init__(self):
221
+ super(MultiScaleDiscriminator, self).__init__()
222
+ self.discriminators = nn.ModuleList([
223
+ DiscriminatorS(use_spectral_norm=True),
224
+ DiscriminatorS(),
225
+ DiscriminatorS(),
226
+ ])
227
+ self.meanpools = nn.ModuleList([
228
+ AvgPool1d(4, 2, padding=2),
229
+ AvgPool1d(4, 2, padding=2)
230
+ ])
231
+
232
+ def forward(self, y, y_hat):
233
+ y_d_rs = []
234
+ y_d_gs = []
235
+ fmap_rs = []
236
+ fmap_gs = []
237
+ for i, d in enumerate(self.discriminators):
238
+ if i != 0:
239
+ y = self.meanpools[i-1](y)
240
+ y_hat = self.meanpools[i-1](y_hat)
241
+ y_d_r, fmap_r = d(y)
242
+ y_d_g, fmap_g = d(y_hat)
243
+ y_d_rs.append(y_d_r)
244
+ fmap_rs.append(fmap_r)
245
+ y_d_gs.append(y_d_g)
246
+ fmap_gs.append(fmap_g)
247
+
248
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
249
+
250
+
251
+ def feature_loss(fmap_r, fmap_g):
252
+ loss = 0
253
+ for dr, dg in zip(fmap_r, fmap_g):
254
+ for rl, gl in zip(dr, dg):
255
+ loss += torch.mean(torch.abs(rl - gl))
256
+
257
+ return loss*2
258
+
259
+
260
+ def discriminator_loss(disc_real_outputs, disc_generated_outputs):
261
+ loss = 0
262
+ r_losses = []
263
+ g_losses = []
264
+ for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
265
+ r_loss = torch.mean((1-dr)**2)
266
+ g_loss = torch.mean(dg**2)
267
+ loss += (r_loss + g_loss)
268
+ r_losses.append(r_loss.item())
269
+ g_losses.append(g_loss.item())
270
+
271
+ return loss, r_losses, g_losses
272
+
273
+
274
+ def generator_loss(disc_outputs):
275
+ loss = 0
276
+ gen_losses = []
277
+ for dg in disc_outputs:
278
+ l = torch.mean((1-dg)**2)
279
+ gen_losses.append(l)
280
+ loss += l
281
+
282
+ return loss, gen_losses
283
+
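models.py is the HiFi-GAN vocoder: the mel-to-waveform Generator plus the multi-period / multi-scale discriminators and the GAN, feature-matching and generator losses used during vocoder training. A minimal sketch of running the generator on a mel-spectrogram; the config path and the `AttrDict` helper from env.py are assumptions (expected to follow the upstream HiFi-GAN layout), and a real run would also load trained generator weights:

    import json
    import torch
    from env import AttrDict          # assumed to wrap the JSON config, as in upstream HiFi-GAN
    from models import Generator

    with open("configs/hifigan-config.json") as f:
        h = AttrDict(json.load(f))

    vocoder = Generator(h).eval()

    mel = torch.randn(1, 80, 200)     # [B, n_mels, T_frames]; a real mel comes from the TTS decoder
    with torch.no_grad():
        audio = vocoder(mel)          # [B, 1, T_frames * prod(h.upsample_rates)] waveform samples
    print(audio.shape)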
text/.DS_Store ADDED
Binary file (6.15 kB).
 
text/LICENSE ADDED
@@ -0,0 +1,30 @@
1
+ CMUdict
2
+ -------
3
+
4
+ CMUdict (the Carnegie Mellon Pronouncing Dictionary) is a free
5
+ pronouncing dictionary of English, suitable for uses in speech
6
+ technology and is maintained by the Speech Group in the School of
7
+ Computer Science at Carnegie Mellon University.
8
+
9
+ The Carnegie Mellon Speech Group does not guarantee the accuracy of
10
+ this dictionary, nor its suitability for any specific purpose. In
11
+ fact, we expect a number of errors, omissions and inconsistencies to
12
+ remain in the dictionary. We intend to continually update the
13
+ dictionary by correction existing entries and by adding new ones. From
14
+ time to time a new major version will be released.
15
+
16
+ We welcome input from users: Please send email to Alex Rudnicky
17
18
+
19
+ The Carnegie Mellon Pronouncing Dictionary, in its current and
20
+ previous versions is Copyright (C) 1993-2014 by Carnegie Mellon
21
+ University. Use of this dictionary for any research or commercial
22
+ purpose is completely unrestricted. If you make use of or
23
+ redistribute this material we request that you acknowledge its
24
+ origin in your descriptions.
25
+
26
+ If you add words to or correct words in your version of this
27
+ dictionary, we would appreciate it if you could send these additions
28
+ and corrections to us ([email protected]) for consideration in a
29
+ subsequent version. All submissions will be reviewed and approved by
30
+ the current maintainer, Alex Rudnicky at Carnegie Mellon.
text/__init__.py ADDED
@@ -0,0 +1,106 @@
1
+ """ from https://github.com/keithito/tacotron """
2
+
3
+ import re
4
+ from text import cleaners
5
+ from text.symbols import symbols
6
+ import torch
7
+
8
+
9
+ _symbol_to_id = {s: i for i, s in enumerate(symbols)}
10
+ _id_to_symbol = {i: s for i, s in enumerate(symbols)}
11
+
12
+ _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
13
+
14
+
15
+ def get_arpabet(word, dictionary):
16
+ word_arpabet = dictionary.lookup(word)
17
+ if word_arpabet is not None:
18
+ return "{" + word_arpabet[0] + "}"
19
+ else:
20
+ return word
21
+
22
+
23
+ def text_to_sequence(text, cleaner_names=["kazakh_cleaners"], dictionary=None):
24
+ '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
25
+
26
+ The text can optionally have ARPAbet sequences enclosed in curly braces embedded
27
+ in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
28
+
29
+ Args:
30
+ text: string to convert to a sequence
31
+ cleaner_names: names of the cleaner functions to run the text through
32
+ dictionary: arpabet class with arpabet dictionary
33
+
34
+ Returns:
35
+ List of integers corresponding to the symbols in the text
36
+ '''
37
+ sequence = []
38
+ space = _symbols_to_sequence(' ')
39
+ # Check for curly braces and treat their contents as ARPAbet:
40
+ while len(text):
41
+ m = _curly_re.match(text)
42
+ if not m:
43
+ clean_text = _clean_text(text, cleaner_names)
44
+ #clean_text = text
45
+ if dictionary is not None:
46
+ clean_text = [get_arpabet(w, dictionary) for w in clean_text.split(" ")]
47
+ for i in range(len(clean_text)):
48
+ t = clean_text[i]
49
+ if t.startswith("{"):
50
+ sequence += _arpabet_to_sequence(t[1:-1])
51
+ else:
52
+ sequence += _symbols_to_sequence(t)
53
+ sequence += space
54
+ else:
55
+ sequence += _symbols_to_sequence(clean_text)
56
+ break
57
+ sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
58
+ sequence += _arpabet_to_sequence(m.group(2))
59
+ text = m.group(3)
60
+
61
+ # remove trailing space
62
+ if dictionary is not None:
63
+ sequence = sequence[:-1] if sequence[-1] == space[0] else sequence
64
+ return sequence
65
+
66
+
67
+ def sequence_to_text(sequence):
68
+ '''Converts a sequence of IDs back to a string'''
69
+ result = ''
70
+ for symbol_id in sequence:
71
+ if symbol_id in _id_to_symbol:
72
+ s = _id_to_symbol[symbol_id]
73
+ # Enclose ARPAbet back in curly braces:
74
+ if len(s) > 1 and s[0] == '@':
75
+ s = '{%s}' % s[1:]
76
+ result += s
77
+ return result.replace('}{', ' ')
78
+
79
+ def convert_text(string):
80
+ text_norm = text_to_sequence(string.lower())
81
+ text_norm = torch.IntTensor(text_norm)
82
+ text_len = torch.IntTensor([text_norm.size(0)])
83
+ text_padded = torch.LongTensor(1, len(text_norm))
84
+ text_padded.zero_()
85
+ text_padded[0, :text_norm.size(0)] = text_norm
86
+ return text_padded, text_len
87
+
88
+ def _clean_text(text, cleaner_names):
89
+ for name in cleaner_names:
90
+ cleaner = getattr(cleaners, name)
91
+ if not cleaner:
92
+ raise Exception('Unknown cleaner: %s' % name)
93
+ text = cleaner(text)
94
+ return text
95
+
96
+
97
+ def _symbols_to_sequence(symbols):
98
+ return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
99
+
100
+
101
+ def _arpabet_to_sequence(text):
102
+ return _symbols_to_sequence(['@' + s for s in text.split()])
103
+
104
+
105
+ def _should_keep_symbol(s):
106
+ return s in _symbol_to_id and s != '_' and s != '~'
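A short sketch of this text front-end (toy strings; the exact symbol inventory comes from text/symbols.py and the default `kazakh_cleaners`, so characters outside that inventory are silently dropped):

    from text import text_to_sequence, sequence_to_text, convert_text

    ids = text_to_sequence("Сәлем, әлем!".lower())   # list of symbol ids
    print(ids)
    print(sequence_to_text(ids))                     # maps the ids back to symbols

    # convert_text lower-cases, cleans and pads the ids into a [1, T_text] LongTensor plus its length
    text_padded, text_len = convert_text("Сәлем, әлем!")
    print(text_padded.shape, text_len)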