# ############################################################################
# Model: TTS with an attention-based (Transformer) mechanism
# Tokens: g2p phonemes + positional embeddings
# Losses: MSE & BCE
# Training: LJSpeech
# ############################################################################
###################################
# Experiment Parameters and setup #
###################################
seed: 1234
__set_seed: !apply:torch.manual_seed [!ref <seed>]
# Folder setup
# output_folder: !ref .\\results\\tts\\<seed>
# save_folder: !ref <output_folder>\\save
output_folder: !ref ./results/<seed>
save_folder: !ref <output_folder>/save
################################
# Model Parameters and model #
################################
# Input parameters
lexicon:
- AA
- AE
- AH
- AO
- AW
- AY
- B
- CH
- D
- DH
- EH
- ER
- EY
- F
- G
- HH
- IH
- IY
- JH
- K
- L
- M
- N
- NG
- OW
- OY
- P
- R
- S
- SH
- T
- TH
- UH
- UW
- V
- W
- Y
- Z
- ZH
input_encoder: !new:speechbrain.dataio.encoder.TextEncoder
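# The encoder maps the ARPAbet phoneme labels above to integer indices. It is
# expected to be populated from <lexicon> by the loading/inference code (e.g.,
# via update_from_iterable), together with the padding/blank entry.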
################################
# Transformer Parameters       #
################################
d_model: 512
nhead: 8
num_encoder_layers: 3
num_decoder_layers: 3
dim_feedforward: 512
dropout: 0.1
# Decoder parameters
# The number of target (mel) frames generated per decoder step
n_frames_per_step: 1
decoder_rnn_dim: 1024
prenet_dim: 256
max_decoder_steps: 1000
gate_threshold: 0.5
p_decoder_dropout: 0.1
decoder_no_early_stopping: False
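# In a Tacotron2-style autoregressive decoding loop, generation stops once the
# predicted stop probability exceeds gate_threshold (unless
# decoder_no_early_stopping is True), or after max_decoder_steps frames at most.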
blank_index: 0 # This special token is used for padding
# Masks
lookahead_mask: !name:speechbrain.lobes.models.transformer.Transformer.get_lookahead_mask
padding_mask: !name:speechbrain.lobes.models.transformer.Transformer.get_key_padding_mask
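# Both are function handles (!name:) called on the batches by the training /
# inference code: the lookahead mask provides the causal (no-future) mask for
# decoder self-attention, and the key-padding mask hides padded positions.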
################################
# CNN 3-layers Prenet #
################################
# Encoder Prenet
encoder_prenet: !new:module_classes.CNNPrenet
# Decoder Prenet
decoder_prenet: !new:module_classes.CNNDecoderPrenet
################################
# Positional Encodings #
################################
# Encoder
pos_emb_enc: !new:module_classes.ScaledPositionalEncoding
    input_size: !ref <d_model>
    max_len: 5000
# Decoder
pos_emb_dec: !new:module_classes.ScaledPositionalEncoding
    input_size: !ref <d_model>
    max_len: 5000
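# ScaledPositionalEncoding is assumed to follow the Transformer-TTS recipe:
# sinusoidal encodings multiplied by a learnable scale before being added to
# the prenet outputs (see module_classes for the exact definition).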
################################
# S2S Transformer             #
################################
Seq2SeqTransformer: !new:torch.nn.Transformer
    d_model: !ref <d_model>
    nhead: !ref <nhead>
    num_encoder_layers: !ref <num_encoder_layers>
    num_decoder_layers: !ref <num_decoder_layers>
    dim_feedforward: !ref <dim_feedforward>
    dropout: !ref <dropout>
    batch_first: True
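# A vanilla torch.nn.Transformer; with batch_first: True both source and
# target tensors are expected as (batch, time, d_model).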
################################
# CNN 5-layers PostNet #
################################
decoder_postnet: !new:speechbrain.lobes.models.Tacotron2.Postnet
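# The Tacotron2 Postnet is a 5-layer 1-D convolutional network; its output is
# typically added as a residual to the raw mel prediction to refine it.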
# Linear projection on top of the decoder output, predicting the stop token.
stop_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: 1
# Linear projection on top of the decoder output, predicting the 80-bin mel spectrogram.
mel_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: 80
modules:
    encoder_prenet: !ref <encoder_prenet>
    pos_emb_enc: !ref <pos_emb_enc>
    decoder_prenet: !ref <decoder_prenet>
    pos_emb_dec: !ref <pos_emb_dec>
    Seq2SeqTransformer: !ref <Seq2SeqTransformer>
    mel_lin: !ref <mel_lin>
    stop_lin: !ref <stop_lin>
    decoder_postnet: !ref <decoder_postnet>
model: !new:torch.nn.ModuleList
    - [!ref <encoder_prenet>, !ref <pos_emb_enc>,
       !ref <decoder_prenet>, !ref <pos_emb_dec>, !ref <Seq2SeqTransformer>,
       !ref <mel_lin>, !ref <stop_lin>, !ref <decoder_postnet>]
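# Wrapping all sub-modules in a single ModuleList exposes one parameter
# collection named "model", which is what the pretrainer below loads from the
# checkpoint.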
pretrained_model_path: ./model.ckpt
# The pretrainer allows a mapping between pretrained files and instances that
# are declared in the yaml. E.g., here the file model.ckpt is downloaded and
# loaded into "model", which points to the <model> defined above.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    collect_in: !ref <save_folder>
    loadables:
        model: !ref <model>
    paths:
        model: !ref <pretrained_model_path>
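# Minimal usage sketch (assumptions: model.ckpt is available at
# <pretrained_model_path> and module_classes is importable; names are
# illustrative, the actual interface code may differ):
#
#   from hyperpyyaml import load_hyperpyyaml
#
#   with open("hyperparams.yaml") as f:
#       hparams = load_hyperpyyaml(f)
#
#   # Collect model.ckpt into <save_folder> and load it into <model>
#   hparams["pretrainer"].collect_files()
#   hparams["pretrainer"].load_collected()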