############################################################################
# Model: TTS with attention-based mechanism
# Tokens: g2p + positional embeddings
# Losses: MSE & BCE
# Training: LJSpeech
############################################################################
###################################
# Experiment Parameters and setup #
###################################
seed: 1234
__set_seed: !apply:torch.manual_seed [!ref <seed>]
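# Note: !apply calls the function at YAML load time, so the line above is
# equivalent to running torch.manual_seed(1234) in Python before training.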
# Folder setup
# output_folder: !ref .\\results\\tts\\<seed>
# save_folder: !ref <output_folder>\\save
output_folder: !ref ./results/<seed>
save_folder: !ref <output_folder>/save
################################
# Model Parameters and model   #
################################
# Input parameters
lexicon:
    - AA
    - AE
    - AH
    - AO
    - AW
    - AY
    - B
    - CH
    - D
    - DH
    - EH
    - ER
    - EY
    - F
    - G
    - HH
    - IH
    - IY
    - JH
    - K
    - L
    - M
    - N
    - NG
    - OW
    - OY
    - P
    - R
    - S
    - SH
    - T
    - TH
    - UH
    - UW
    - V
    - W
    - Y
    - Z
    - ZH
input_encoder: !new:speechbrain.dataio.encoder.TextEncoder
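# The encoder above starts empty; the training script is expected to fill it
# from the lexicon. A minimal sketch, assuming the usual SpeechBrain recipe
# pattern (these calls exist on speechbrain.dataio.encoder.TextEncoder):
#   input_encoder.update_from_iterable(lexicon, sequence_input=False)
#   input_encoder.add_unk()  # map unseen g2p tokens to <unk>
#   token_ids = input_encoder.encode_sequence_torch(phoneme_list)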
################################
# Transformer Parameters      #
################################
d_model: 512
nhead: 8
num_encoder_layers: 3
num_decoder_layers: 3
dim_feedforward: 512
dropout: 0.1
# Decoder parameters
# The number of frames in the target per decoder step
n_frames_per_step: 1
decoder_rnn_dim: 1024
prenet_dim: 256
max_decoder_steps: 1000
gate_threshold: 0.5
p_decoder_dropout: 0.1
decoder_no_early_stopping: False
blank_index: 0 # This special token is used for padding
# Masks
lookahead_mask: !name:speechbrain.lobes.models.transformer.Transformer.get_lookahead_mask
padding_mask: !name:speechbrain.lobes.models.transformer.Transformer.get_key_padding_mask
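# Usage sketch (hedged; variable names are illustrative): in the forward pass
# these helpers build the masks consumed by the transformer below, roughly:
#   tgt_mask = lookahead_mask(shifted_mel_targets)  # causal decoder mask
#   src_key_padding = padding_mask(tokens, pad_idx=blank_index)
#   tgt_key_padding = padding_mask(shifted_mel_targets, pad_idx=blank_index)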
################################
# 3-layer CNN Prenet          #
################################
# Encoder Prenet
encoder_prenet: !new:module_classes.CNNPrenet
# Decoder Prenet
decoder_prenet: !new:module_classes.CNNDecoderPrenet
################################
# Positional Encodings        #
################################
# Encoder
pos_emb_enc: !new:module_classes.ScaledPositionalEncoding
    input_size: !ref <d_model>
    max_len: 5000
# Decoder
pos_emb_dec: !new:module_classes.ScaledPositionalEncoding
    input_size: !ref <d_model>
    max_len: 5000
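# ScaledPositionalEncoding comes from the custom module_classes file; it is
# assumed here to be the FastSpeech-style scaled sinusoidal encoding, i.e. the
# standard encoding weighted by a single learnable scalar alpha:
#   PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
#   PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))
#   output = x + alpha * PE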
################################
# S2S Transformer             #
################################
Seq2SeqTransformer: !new:torch.nn.Transformer
    d_model: !ref <d_model>
    nhead: !ref <nhead>
    num_encoder_layers: !ref <num_encoder_layers>
    num_decoder_layers: !ref <num_decoder_layers>
    dim_feedforward: !ref <dim_feedforward>
    dropout: !ref <dropout>
    batch_first: True
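# Forward-call sketch (hedged; with batch_first=True, torch.nn.Transformer
# expects (batch, time, d_model) tensors, and the masks are the ones built by
# the helpers declared above):
#   decoder_out = Seq2SeqTransformer(
#       src, tgt,
#       tgt_mask=tgt_mask,
#       src_key_padding_mask=src_key_padding,
#       tgt_key_padding_mask=tgt_key_padding,
#   )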
################################
# 5-layer CNN PostNet         #
################################
decoder_postnet: !new:speechbrain.lobes.models.Tacotron2.Postnet
# Linear projection on top of the decoder, predicting the stop token.
stop_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: 1
# Linear projection on top of the decoder, predicting the 80-channel mel spectrogram.
mel_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: 80
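# Inference sketch (hedged): decoding is autoregressive. At each step the
# decoder output is projected by mel_lin (next mel frame) and stop_lin
# (stop logit), and generation halts when
#   torch.sigmoid(stop_logit) > gate_threshold
# or after max_decoder_steps frames (unless decoder_no_early_stopping is True).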
modules:
    encoder_prenet: !ref <encoder_prenet>
    pos_emb_enc: !ref <pos_emb_enc>
    decoder_prenet: !ref <decoder_prenet>
    pos_emb_dec: !ref <pos_emb_dec>
    Seq2SeqTransformer: !ref <Seq2SeqTransformer>
    mel_lin: !ref <mel_lin>
    stop_lin: !ref <stop_lin>
    decoder_postnet: !ref <decoder_postnet>
model: !new:torch.nn.ModuleList
    - [!ref <encoder_prenet>, !ref <pos_emb_enc>,
       !ref <decoder_prenet>, !ref <pos_emb_dec>, !ref <Seq2SeqTransformer>,
       !ref <mel_lin>, !ref <stop_lin>, !ref <decoder_postnet>]
pretrained_model_path: ./model.ckpt
# The pretrainer allows a mapping between pretrained files and instances that
# are declared in the yaml. E.g., here we will download the file model.ckpt
# and it will be loaded into "model", which points to the <model> defined
# above.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    collect_in: !ref <save_folder>
    loadables:
        model: !ref <model>
    paths:
        model: !ref <pretrained_model_path>
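# Loading sketch (hedged): after the YAML is instantiated, the checkpoint is
# typically fetched and applied from the inference/training script with:
#   pretrainer.collect_files()
#   pretrainer.load_collected()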