# This EXPERIMENTAL configuration is for ESPnet2 to finetune
# Tacotron 2 + HiFiGAN vocoder jointly. To run this config, you
# need to specify at least the "--tts_task gan_tts" option for
# tts.sh and use 22050 Hz audio as the training data (mainly
# tested on LJSpeech).
# This configuration was tested on 4 GPUs with 12 GB of GPU
# memory each. It takes a bit less than 1 week to finish the
# training, but a model at around 100k iterations should already
# generate reasonable results.
# YOU NEED TO MODIFY THE "*_params" AND "init_param" SECTIONS
# IF YOU WANT TO USE YOUR OWN PRETRAINED MODELS.
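#
# Usage sketch (added; the config path below is an assumption, not part of
# the original file). In an ESPnet2 TTS recipe directory, a joint finetuning
# config like this is typically passed to tts.sh through run.sh, e.g.:
#
#   ./run.sh \
#       --tts_task gan_tts \
#       --train_config conf/tuning/finetune_joint_tacotron2_hifigan.yaml
#
# where --train_config should point at wherever this file lives in your recipe.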
##########################################################
#                   TTS MODEL SETTING                    #
##########################################################
tts: joint_text2wav
tts_conf:
    # copied from pretrained model's config.yaml
    text2mel_type: tacotron2
    text2mel_params:
        embed_dim: 512                  # char or phn embedding dimension
        elayers: 1                      # number of blstm layers in encoder
        eunits: 512                     # number of blstm units
        econv_layers: 3                 # number of convolutional layers in encoder
        econv_chans: 512                # number of channels in convolutional layer
        econv_filts: 5                  # filter size of convolutional layer
        atype: location                 # attention function type
        adim: 512                       # attention dimension
        aconv_chans: 32                 # number of channels in convolutional layer of attention
        aconv_filts: 15                 # filter size of convolutional layer of attention
        cumulate_att_w: true            # whether to cumulate attention weight
        dlayers: 2                      # number of lstm layers in decoder
        dunits: 1024                    # number of lstm units in decoder
        prenet_layers: 2                # number of layers in prenet
        prenet_units: 256               # number of units in prenet
        postnet_layers: 5               # number of layers in postnet
        postnet_chans: 512              # number of channels in postnet
        postnet_filts: 5                # filter size of postnet layer
        output_activation: null         # activation function for the final output
        use_batch_norm: true            # whether to use batch normalization in encoder
        use_concate: true               # whether to concatenate encoder embedding with decoder outputs
        use_residual: false             # whether to use residual connection in encoder
        spk_embed_dim: 192              # speaker embedding dimension
        spk_embed_integration_type: add # how to integrate speaker embedding
        dropout_rate: 0.5               # dropout rate
        zoneout_rate: 0.1               # zoneout rate
        reduction_factor: 1             # reduction factor
        use_masking: true               # whether to apply masking for padded part in loss calculation
        bce_pos_weight: 10.0            # weight of positive sample in binary cross entropy calculation
        use_guided_attn_loss: true      # whether to use guided attention loss
        guided_attn_loss_sigma: 0.4     # sigma of guided attention loss
        guided_attn_loss_lambda: 1.0    # strength of guided attention loss
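        # Note (added): the guided attention loss pushes the attention toward a
        # diagonal alignment; the usual penalty for encoder index n (of N) and
        # decoder index t (of T) is roughly
        #     w(n, t) = 1 - exp(-(n / N - t / T)^2 / (2 * sigma^2)),
        # so guided_attn_loss_sigma sets the width of the allowed diagonal band
        # and guided_attn_loss_lambda scales the term in the total loss.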
    # copied from pretrained vocoder's config.yaml
    vocoder_type: hifigan_generator
    vocoder_params:
        bias: true
        channels: 512
        in_channels: 80
        kernel_size: 7
        nonlinear_activation: LeakyReLU
        nonlinear_activation_params:
            negative_slope: 0.1
        out_channels: 1
        resblock_dilations:
        - - 1
          - 3
          - 5
        - - 1
          - 3
          - 5
        - - 1
          - 3
          - 5
        resblock_kernel_sizes:
        - 3
        - 7
        - 11
        upsample_kernel_sizes:
        - 16
        - 16
        - 4
        - 4
        upsample_scales:
        - 8
        - 8
        - 2
        - 2
        use_additional_convs: true
        use_weight_norm: true
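    # Note (added): the product of upsample_scales (8 * 8 * 2 * 2 = 256) matches
    # the mel hop size used below (hop_length: 256) and in_channels: 80 matches
    # n_mels, so one mel frame is upsampled to exactly 256 waveform samples.
    # Keep these consistent if you swap in your own pretrained vocoder.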
    # copied from pretrained vocoder's config.yaml
    discriminator_type: hifigan_multi_scale_multi_period_discriminator
    discriminator_params:
        follow_official_norm: true
        period_discriminator_params:
            bias: true
            channels: 32
            downsample_scales:
            - 3
            - 3
            - 3
            - 3
            - 1
            in_channels: 1
            kernel_sizes:
            - 5
            - 3
            max_downsample_channels: 1024
            nonlinear_activation: LeakyReLU
            nonlinear_activation_params:
                negative_slope: 0.1
            out_channels: 1
            use_spectral_norm: false
            use_weight_norm: true
        periods:
        - 2
        - 3
        - 5
        - 7
        - 11
        scale_discriminator_params:
            bias: true
            channels: 128
            downsample_scales:
            - 4
            - 4
            - 4
            - 4
            - 1
            in_channels: 1
            kernel_sizes:
            - 15
            - 41
            - 5
            - 3
            max_downsample_channels: 1024
            max_groups: 16
            nonlinear_activation: LeakyReLU
            nonlinear_activation_params:
                negative_slope: 0.1
            out_channels: 1
        scale_downsample_pooling: AvgPool1d
        scale_downsample_pooling_params:
            kernel_size: 4
            padding: 2
            stride: 2
        scales: 3
    # loss function related
    generator_adv_loss_params:
        average_by_discriminators: false # whether to average loss value by #discriminators
        loss_type: mse                   # loss type, "mse" or "hinge"
    discriminator_adv_loss_params:
        average_by_discriminators: false # whether to average loss value by #discriminators
        loss_type: mse                   # loss type, "mse" or "hinge"
    use_feat_match_loss: true            # whether to use feat match loss
    feat_match_loss_params:
        average_by_discriminators: false # whether to average loss value by #discriminators
        average_by_layers: false         # whether to average loss value by #layers of each discriminator
        include_final_outputs: true      # whether to include final outputs for loss calculation
    use_mel_loss: true   # whether to use mel-spectrogram loss
    mel_loss_params:
        fs: 22050        # must be the same as the training data
        n_fft: 1024      # fft points
        hop_length: 256  # hop size
        win_length: null # window length
        window: hann     # window type
        n_mels: 80       # number of Mel basis
        fmin: 0          # minimum frequency for Mel basis
        fmax: null       # maximum frequency for Mel basis
        log_base: null   # null represents natural log
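        # Note (added): at fs = 22050 Hz these settings correspond to a frame shift of
        # 256 / 22050 ≈ 11.6 ms and an analysis window of 1024 / 22050 ≈ 46.4 ms;
        # fmax: null is typically treated as the Nyquist frequency, fs / 2 = 11025 Hz.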
    lambda_text2mel: 1.0   # loss scaling coefficient for text2mel loss
    lambda_adv: 1.0        # loss scaling coefficient for adversarial loss
    lambda_mel: 45.0       # loss scaling coefficient for Mel loss
    lambda_feat_match: 2.0 # loss scaling coefficient for feat match loss
    # others
    sampling_rate: 22050          # needed in the inference for saving wav
    segment_size: 32              # segment size for random windowed discriminator
    cache_generator_outputs: true # whether to cache generator outputs in the training
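    # Note (added): segment_size is counted in feature frames, so with the
    # 256-sample hop above each randomly windowed discriminator segment covers
    # 32 * 256 = 8192 waveform samples (an assumption based on the upsampling
    # factor; check the joint_text2wav implementation if you change these values).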
# extra module for additional inputs
#pitch_extract: dio           # pitch extractor type
#pitch_extract_conf:
#    reduction_factor: 1
#pitch_normalize: global_mvn  # normalizer for the pitch feature
#energy_extract: energy       # energy extractor type
#energy_extract_conf:
#    reduction_factor: 1
#energy_normalize: global_mvn # normalizer for the energy feature

# initialization (might need to modify for your own pretrained model)
init_param:
- exp/22k/tts_train_tacotron2_raw_char/train.loss.ave_5best.pth:tts:tts.generator.text2mel
- exp/22k/ljspeech_hifigan.v1/generator.pth::tts.generator.vocoder
- exp/22k/ljspeech_hifigan.v1/discriminator.pth::tts.discriminator
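# Note (added): each init_param entry uses ESPnet2's
# "<checkpoint_path>:<src_key>:<dst_key>" convention: parameters found under
# <src_key> in the checkpoint are loaded into <dst_key> of this joint model,
# and an empty <src_key> (as in the vocoder and discriminator lines) loads the
# whole state dict. Replace the exp/22k/... paths with your own pretrained
# text2mel, vocoder, and discriminator checkpoints when finetuning on other data.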
##########################################################
#             OPTIMIZER & SCHEDULER SETTING              #
##########################################################
# optimizer setting for generator
optim: adam
optim_conf:
    lr: 1.25e-5
    betas: [0.5, 0.9]
    weight_decay: 0.0
scheduler: exponentiallr
scheduler_conf:
    gamma: 0.999875
# optimizer setting for discriminator
optim2: adam
optim2_conf:
    lr: 1.25e-5
    betas: [0.5, 0.9]
    weight_decay: 0.0
scheduler2: exponentiallr
scheduler2_conf:
    gamma: 0.999875
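# Note (added): with exponentiallr both learning rates decay geometrically as
# lr_k = 1.25e-5 * 0.999875^k after k scheduler steps; for example,
# 0.999875^1000 ≈ 0.882, i.e. roughly 1.10e-5.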
generator_first: true # whether to start updating generator first
##########################################################
#                 OTHER TRAINING SETTING                 #
##########################################################
#num_iters_per_epoch: 1000 # number of iterations per epoch
max_epoch: 500             # number of epochs
accum_grad: 1              # gradient accumulation
batch_bins: 1600000        # batch bins (feats_type=raw)
batch_type: numel          # how to make batch
grad_clip: -1              # gradient clipping norm
grad_noise: false          # whether to use gradient noise injection
sort_in_batch: descending  # how to sort data in making batch
sort_batch: descending     # how to sort created batches
num_workers: 4             # number of workers of data loader
use_amp: false             # whether to use pytorch amp
log_interval: 50           # log interval in iterations
keep_nbest_models: 5       # number of models to keep
num_att_plot: 3            # number of attention figures to be saved in every check
seed: 777                  # random seed number
patience: null             # patience for early stopping
unused_parameters: true    # needed for multi gpu case
best_model_criterion:      # criterion to save the best models
-   - valid
    - text2mel_loss
    - min
-   - train
    - text2mel_loss
    - min
-   - train
    - total_count
    - max
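# Note (added): each best_model_criterion entry is a (phase, metric, mode) triple,
# e.g. "valid / text2mel_loss / min" keeps the checkpoints with the lowest
# validation text2mel loss; keep_nbest_models: 5 above bounds how many checkpoints
# are retained for each criterion.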
cudnn_deterministic: false # setting to false accelerates the training speed but makes it non-deterministic
                           # in the case of GAN-TTS training, we strongly recommend setting it to false
cudnn_benchmark: false     # setting to true might accelerate the training speed but sometimes decreases it
                           # therefore, we set it to false as a default (recommend trying both cases)