################################
# Audio Parameters             #
################################
sample_rate: 22050
hop_length: 256
win_length: 1024
n_mel_channels: 80
n_fft: 1024
mel_fmin: 0.0
mel_fmax: 8000.0
power: 1
normalized: False
min_max_energy_norm: True
norm: "slaney"
mel_scale: "slaney"
dynamic_range_compression: True
mel_normalized: False
min_f0: 65  # (torchaudio pyin values)
max_f0: 2093  # (torchaudio pyin values)
positive_weight: 5.0
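
# With sample_rate 22050 and hop_length 256, each mel frame advances by
# 256 / 22050 ~= 11.6 ms (~86 frames/s), over a win_length of 1024 samples
# ~= 46.4 ms of context; mel_fmax of 8000 Hz stays below the 11.025 kHz
# Nyquist limit and is a common choice for 22.05 kHz TTS data.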
lexicon:
- AA
- AE
- AH
- AO
- AW
- AY
- B
- CH
- D
- DH
- EH
- ER
- EY
- F
- G
- HH
- IH
- IY
- JH
- K
- L
- M
- N
- NG
- OW
- OY
- P
- R
- S
- SH
- T
- TH
- UH
- UW
- V
- W
- Y
- Z
- ZH
- ' '
n_symbols: 42  # fixed by the symbols in the lexicon, plus a dummy symbol used for padding
padding_idx: 0
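
# The lexicon holds 40 entries (the 39 ARPAbet phonemes plus a space);
# n_symbols: 42 leaves room for the special token indices defined below.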

################################
# Model Architecture           #
################################
d_model: 512
nhead: 8
num_encoder_layers: 6
num_decoder_layers: 6
dim_feedforward: 2048
dropout: 0.2
blank_index: 0 # This special token is for padding
bos_index: 1
eos_index: 2
stop_weight: 0.45
stop_threshold: 0.5
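
# stop_threshold is, presumably, the sigmoid probability above which the
# stop_lin prediction ends autoregressive decoding; stop_weight and
# positive_weight up-weight the rare positive stop frames in the loss.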

################### PRENET #######################
enc_pre_net: !new:models.EncoderPrenet
dec_pre_net: !new:models.DecoderPrenet

encoder_emb: !new:torch.nn.Embedding
    num_embeddings: 128
    embedding_dim: !ref <d_model>
    padding_idx: !ref <blank_index>

pos_emb_enc: !new:models.ScaledPositionalEncoding
    d_model: !ref <d_model>

decoder_emb: !new:torch.nn.Embedding
    num_embeddings: 128
    embedding_dim: !ref <d_model>
    padding_idx: !ref <blank_index>

pos_emb_dec: !new:models.ScaledPositionalEncoding
    d_model: !ref <d_model>

Seq2SeqTransformer: !new:torch.nn.Transformer
    d_model: !ref <d_model>
    nhead: !ref <nhead>
    num_encoder_layers: !ref <num_encoder_layers>
    num_decoder_layers: !ref <num_decoder_layers>
    dim_feedforward: !ref <dim_feedforward>
    dropout: !ref <dropout>
    batch_first: True
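
# With batch_first: True, torch.nn.Transformer expects (batch, time, d_model)
# tensors from the prenet / positional-encoding stages above.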

postnet: !new:models.PostNet
    mel_channels: !ref <n_mel_channels>
    postnet_channels: 512
    kernel_size: 5
    postnet_layers: 5

mel_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <n_mel_channels>

stop_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: 1
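
# mel_lin projects each d_model-dim decoder frame onto the 80 mel bins;
# stop_lin emits one stop logit per frame, compared against stop_threshold.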

mel_spec_feats: !name:speechbrain.lobes.models.FastSpeech2.mel_spectogram
    sample_rate: !ref <sample_rate>
    hop_length: !ref <hop_length>
    win_length: !ref <win_length>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mel_channels>
    f_min: !ref <mel_fmin>
    f_max: !ref <mel_fmax>
    power: !ref <power>
    normalized: !ref <normalized>
    min_max_energy_norm: !ref <min_max_energy_norm>
    norm: !ref <norm>
    mel_scale: !ref <mel_scale>
    compression: !ref <dynamic_range_compression>
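
# !name: binds the keyword arguments above into a callable (a partial) rather
# than computing features at load time; the audio itself is supplied later,
# per waveform. The mel_spectogram spelling matches the SpeechBrain API.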

modules:
    enc_pre_net: !ref <enc_pre_net>
    encoder_emb: !ref <encoder_emb>
    pos_emb_enc: !ref <pos_emb_enc>
    dec_pre_net: !ref <dec_pre_net>
    # decoder_emb: !ref <decoder_emb>
    pos_emb_dec: !ref <pos_emb_dec>
    Seq2SeqTransformer: !ref <Seq2SeqTransformer>
    postnet: !ref <postnet>
    mel_lin: !ref <mel_lin>
    stop_lin: !ref <stop_lin>
    model: !ref <model>
lookahead_mask: !name:speechbrain.lobes.models.transformer.Transformer.get_lookahead_mask
padding_mask: !name:speechbrain.lobes.models.transformer.Transformer.get_key_padding_mask

model: !new:torch.nn.ModuleList
    - [!ref <enc_pre_net>, !ref <encoder_emb>, !ref <pos_emb_enc>, !ref <dec_pre_net>, !ref <pos_emb_dec>, !ref <Seq2SeqTransformer>, !ref <postnet>, !ref <mel_lin>, !ref <stop_lin>]
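
# Wrapping all trainable blocks in a single ModuleList exposes one `model`
# object whose parameters the pretrainer below can restore from model.ckpt.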
label_encoder: !new:speechbrain.dataio.encoder.TextEncoder
pretrained_path: /content/

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        model: !ref <model>
        label_encoder: !ref <label_encoder>
    paths:
        model: !ref <pretrained_path>/model.ckpt
        label_encoder: !ref <pretrained_path>/label_encoder.txt
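
# A minimal usage sketch (assumes this file is saved as hparams.yaml and that
# model.ckpt / label_encoder.txt exist under pretrained_path):
#
#   from hyperpyyaml import load_hyperpyyaml
#
#   with open("hparams.yaml") as f:
#       hparams = load_hyperpyyaml(f)
#
#   # Fetch the files listed under `loadables`, then load the weights
#   hparams["pretrainer"].collect_files()
#   hparams["pretrainer"].load_collected()
#
#   transformer = hparams["Seq2SeqTransformer"]  # instantiated torch module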