# ################################################################
# Audio / feature-extraction hyperparameters
# (SpeechBrain HyperPyYAML — Transformer TTS recipe)
# NOTE(review): the original file had a trailing " |" artifact on
# every line (table-extraction residue); removed throughout.
# ################################################################

sample_rate: 22050
hop_length: 256
win_length: 1024
n_mel_channels: 80
n_fft: 1024
mel_fmin: 0.0
mel_fmax: 8000.0
power: 1
normalized: False
min_max_energy_norm: True
norm: "slaney"
mel_scale: "slaney"
dynamic_range_compression: True
mel_normalized: False

# Pitch range bounds (Hz) — presumably used for F0 extraction elsewhere
# in the recipe; verify against the training script.
min_f0: 65
max_f0: 2093

# Weight for the positive (stop) class — presumably the pos_weight of a
# BCE-style stop-token loss; confirm against the loss definition.
positive_weight: 5.0
# Phoneme inventory: 39 ARPAbet phones plus the space character
# (40 entries total).
lexicon:
    - AA
    - AE
    - AH
    - AO
    - AW
    - AY
    - B
    - CH
    - D
    - DH
    - EH
    - ER
    - EY
    - F
    - G
    - HH
    - IH
    - IY
    - JH
    - K
    - L
    - M
    - N
    - NG
    - OW
    - OY
    - P
    - R
    - S
    - SH
    - T
    - TH
    - UH
    - UW
    - V
    - W
    - Y
    - Z
    - ZH
    - ' '

# NOTE(review): lexicon has 40 entries; 42 presumably accounts for the
# BOS and EOS tokens added by the label encoder — confirm against the
# label_encoder setup in the training script.
n_symbols: 42
padding_idx: 0
# ################################################################
# Transformer model hyperparameters
# ################################################################

d_model: 512
nhead: 8
num_encoder_layers: 6
num_decoder_layers: 6
dim_feedforward: 2048
dropout: 0.2

# Special token indices (blank_index doubles as the embedding padding_idx
# below).
blank_index: 0
bos_index: 1
eos_index: 2

# Stop-token prediction: loss weight and decision threshold at inference.
stop_weight: 0.45
stop_threshold: 0.5
# ################################################################
# Module constructors
# ################################################################

# Pre-nets defined in the local models.py
enc_pre_net: !new:models.EncoderPrenet
dec_pre_net: !new:models.DecoderPrenet

# Token embedding for the encoder input.
# NOTE(review): num_embeddings (128) exceeds n_symbols (42) — presumably
# over-provisioned headroom; confirm this is intentional.
encoder_emb: !new:torch.nn.Embedding
    num_embeddings: 128
    embedding_dim: !ref <d_model>
    padding_idx: !ref <blank_index>

pos_emb_enc: !new:models.ScaledPositionalEncoding
    d_model: !ref <d_model>

decoder_emb: !new:torch.nn.Embedding
    num_embeddings: 128
    embedding_dim: !ref <d_model>
    padding_idx: !ref <blank_index>

pos_emb_dec: !new:models.ScaledPositionalEncoding
    d_model: !ref <d_model>

# Core sequence-to-sequence transformer (batch-first tensors).
Seq2SeqTransformer: !new:torch.nn.Transformer
    d_model: !ref <d_model>
    nhead: !ref <nhead>
    num_encoder_layers: !ref <num_encoder_layers>
    num_decoder_layers: !ref <num_decoder_layers>
    dim_feedforward: !ref <dim_feedforward>
    dropout: !ref <dropout>
    batch_first: True

# Convolutional post-net refining the predicted mel spectrogram.
postnet: !new:models.PostNet
    mel_channels: !ref <n_mel_channels>
    postnet_channels: 512
    kernel_size: 5
    postnet_layers: 5

# Linear projection from decoder states to mel channels.
mel_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <n_mel_channels>

# Linear projection to the scalar stop-token logit.
stop_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: 1

# Mel-spectrogram feature extractor.
# NOTE: "mel_spectogram" (sic) is the actual function name in the
# SpeechBrain FastSpeech2 lobe — do not "correct" the spelling.
mel_spec_feats: !name:speechbrain.lobes.models.FastSpeech2.mel_spectogram
    sample_rate: !ref <sample_rate>
    hop_length: !ref <hop_length>
    win_length: !ref <win_length>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mel_channels>
    f_min: !ref <mel_fmin>
    f_max: !ref <mel_fmax>
    power: !ref <power>
    normalized: !ref <normalized>
    min_max_energy_norm: !ref <min_max_energy_norm>
    norm: !ref <norm>
    mel_scale: !ref <mel_scale>
    compression: !ref <dynamic_range_compression>
# Modules registered with the Brain class (moved to device, train/eval
# toggled together).
# NOTE(review): "model" is !ref'd here but defined further down this
# file — verify HyperPyYAML resolves this forward reference in your
# SpeechBrain version.
modules:
    enc_pre_net: !ref <enc_pre_net>
    encoder_emb: !ref <encoder_emb>
    pos_emb_enc: !ref <pos_emb_enc>
    dec_pre_net: !ref <dec_pre_net>
    pos_emb_dec: !ref <pos_emb_dec>
    Seq2SeqTransformer: !ref <Seq2SeqTransformer>
    postnet: !ref <postnet>
    mel_lin: !ref <mel_lin>
    stop_lin: !ref <stop_lin>
    model: !ref <model>

# Mask helper functions (causal decoder mask and key-padding mask).
lookahead_mask: !name:speechbrain.lobes.models.transformer.Transformer.get_lookahead_mask
padding_mask: !name:speechbrain.lobes.models.transformer.Transformer.get_key_padding_mask

# Single container holding every trainable sub-module, used as the
# checkpoint/pretrainer unit below.
model: !new:torch.nn.ModuleList
    - [!ref <enc_pre_net>, !ref <encoder_emb>, !ref <pos_emb_enc>, !ref <dec_pre_net>, !ref <pos_emb_dec>, !ref <Seq2SeqTransformer>, !ref <postnet>, !ref <mel_lin>, !ref <stop_lin>]

# Text-to-index encoder for the phoneme lexicon.
label_encoder: !new:speechbrain.dataio.encoder.TextEncoder

# Directory containing the pretrained checkpoint files.
pretrained_path: /content/
# Loads pretrained weights and the fitted label encoder from
# <pretrained_path>.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        model: !ref <model>
        label_encoder: !ref <label_encoder>
    paths:
        model: !ref <pretrained_path>/model.ckpt
        label_encoder: !ref <pretrained_path>/label_encoder.txt