# TTS / hyperparams.yaml
################################
# Audio Parameters #
################################
sample_rate: 22050
hop_length: 256
win_length: 1024
n_mel_channels: 80
n_fft: 1024
mel_fmin: 0.0
mel_fmax: 8000.0
power: 1
normalized: False
min_max_energy_norm: True
norm: "slaney"
mel_scale: "slaney"
dynamic_range_compression: True
mel_normalized: False
min_f0: 65 #(torchaudio pyin values)
max_f0: 2093 #(torchaudio pyin values)
positive_weight: 5.0
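# The STFT/mel settings above are reused by <mel_spec_feats> further down.
# min_f0/max_f0 bound pitch estimation, and positive_weight is presumably the
# positive-class weight of the stop-token loss (an assumption; not stated in this file).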
lexicon:
- AA
- AE
- AH
- AO
- AW
- AY
- B
- CH
- D
- DH
- EH
- ER
- EY
- F
- G
- HH
- IH
- IY
- JH
- K
- L
- M
- N
- NG
- OW
- OY
- P
- R
- S
- SH
- T
- TH
- UH
- UW
- V
- W
- Y
- Z
- ZH
- ' '
n_symbols: 42 # set from the number of symbols in the lexicon, plus a dummy symbol used for padding
padding_idx: 0
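# The lexicon above is the ARPAbet phoneme inventory (plus a space symbol) that the
# pretrained <label_encoder> is expected to map to indices; index 0 is reserved for padding.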
# Define model architecture
d_model: 512
nhead: 8
num_encoder_layers: 6
num_decoder_layers: 6
dim_feedforward: 2048
dropout: 0.2
blank_index: 0 # This special token is for padding
bos_index: 1
eos_index: 2
stop_weight: 0.45
stop_threshold: 0.5
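# stop_weight and stop_threshold are assumed to scale the stop-token loss and to set
# the probability at which autoregressive decoding halts, respectively.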
###################PRENET#######################
enc_pre_net: !new:models.EncoderPrenet
dec_pre_net: !new:models.DecoderPrenet
encoder_emb: !new:torch.nn.Embedding
    num_embeddings: 128
    embedding_dim: !ref <d_model>
    padding_idx: !ref <blank_index>
pos_emb_enc: !new:models.ScaledPositionalEncoding
    d_model: !ref <d_model>
decoder_emb: !new:torch.nn.Embedding
    num_embeddings: 128
    embedding_dim: !ref <d_model>
    padding_idx: !ref <blank_index>
pos_emb_dec: !new:models.ScaledPositionalEncoding
    d_model: !ref <d_model>
Seq2SeqTransformer: !new:torch.nn.Transformer
    d_model: !ref <d_model>
    nhead: !ref <nhead>
    num_encoder_layers: !ref <num_encoder_layers>
    num_decoder_layers: !ref <num_decoder_layers>
    dim_feedforward: !ref <dim_feedforward>
    dropout: !ref <dropout>
    batch_first: True
postnet: !new:models.PostNet
    mel_channels: !ref <n_mel_channels>
    postnet_channels: 512
    kernel_size: 5
    postnet_layers: 5
mel_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <n_mel_channels>
stop_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: 1
mel_spec_feats: !name:speechbrain.lobes.models.FastSpeech2.mel_spectogram
    sample_rate: !ref <sample_rate>
    hop_length: !ref <hop_length>
    win_length: !ref <win_length>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mel_channels>
    f_min: !ref <mel_fmin>
    f_max: !ref <mel_fmax>
    power: !ref <power>
    normalized: !ref <normalized>
    min_max_energy_norm: !ref <min_max_energy_norm>
    norm: !ref <norm>
    mel_scale: !ref <mel_scale>
    compression: !ref <dynamic_range_compression>
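# Note: entries tagged !new: are instantiated when this YAML is loaded, whereas
# !name: entries (such as <mel_spec_feats> here and the mask helpers below) only
# bind the given arguments and return a callable to be invoked at runtime.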
modules:
    enc_pre_net: !ref <enc_pre_net>
    encoder_emb: !ref <encoder_emb>
    pos_emb_enc: !ref <pos_emb_enc>
    dec_pre_net: !ref <dec_pre_net>
    # decoder_emb: !ref <decoder_emb>
    pos_emb_dec: !ref <pos_emb_dec>
    Seq2SeqTransformer: !ref <Seq2SeqTransformer>
    postnet: !ref <postnet>
    mel_lin: !ref <mel_lin>
    stop_lin: !ref <stop_lin>
    model: !ref <model>
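# modules is the dictionary a SpeechBrain Brain (or inference wrapper) typically
# iterates over to place submodules on the right device; <model> below is included as well.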
lookahead_mask: !name:speechbrain.lobes.models.transformer.Transformer.get_lookahead_mask
padding_mask: !name:speechbrain.lobes.models.transformer.Transformer.get_key_padding_mask
model: !new:torch.nn.ModuleList
    - [!ref <enc_pre_net>, !ref <encoder_emb>, !ref <pos_emb_enc>, !ref <dec_pre_net>, !ref <pos_emb_dec>, !ref <Seq2SeqTransformer>, !ref <postnet>, !ref <mel_lin>, !ref <stop_lin>]
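# Collecting every trainable submodule in one ModuleList lets the pretrainer below
# restore all of their parameters from a single checkpoint file (model.ckpt).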
label_encoder: !new:speechbrain.dataio.encoder.TextEncoder
pretrained_path: /content/
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        model: !ref <model>
        label_encoder: !ref <label_encoder>
    paths:
        model: !ref <pretrained_path>/model.ckpt
        label_encoder: !ref <pretrained_path>/label_encoder.txt
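# A minimal loading sketch (an assumption about typical usage, not part of this recipe);
# it presumes the file is saved as hyperparams.yaml and that the checkpoints referenced
# under <pretrained_path> are available locally:
#
#   from hyperpyyaml import load_hyperpyyaml
#
#   with open("hyperparams.yaml") as fin:
#       hparams = load_hyperpyyaml(fin, overrides={"pretrained_path": "/content/"})
#   hparams["pretrainer"].collect_files()   # gather model.ckpt and label_encoder.txt
#   hparams["pretrainer"].load_collected()  # load them into <model> and <label_encoder>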