# uetasr-conformer_30.3m / config.yaml
# thanhtvt
# remove decoder from config file
# 6899257
# Subword text featurizer. The keys nested below are the keyword arguments
# handed to uetasr.featurizers.text.Subword by the `!new:` tag.
text_encoder: !new:uetasr.featurizers.text.Subword
  model_prefix: vocabs/subword_vietnamese_500
  data_path: transcript_v3.1.txt
  character_coverage: 1.0
  model_type: bpe  # word bpe unigram char
  num_threads: 16
  # Special-token ids and their surface forms.
  # NOTE(review): these look like SentencePiece trainer options, where an id
  # of -1 disables the token (so no </s> would be emitted) - confirm against
  # the Subword implementation.
  unk_id: 1
  pad_id: 0
  eos_id: -1
  unk_piece: <unk>
  # The piece at id 0 is named <blank>; presumably it doubles as the
  # transducer blank symbol - confirm.
  pad_piece: <blank>
  eos_piece: </s>
  # Must stay nested here: decoder_model.vocab_size and
  # jointer_model.output_dim resolve it via !ref <text_encoder.vocab_size>.
  vocab_size: 500
# Log-mel spectrogram front end. The keys nested below are the keyword
# arguments handed to uetasr.featurizers.audio.LogMelSpectrogram by `!new:`.
audio_encoder: !new:uetasr.featurizers.audio.LogMelSpectrogram
  fs: 16000
  n_fft: 512
  # Assuming fs is in Hz and these are in samples, 400 / 160 samples
  # correspond to a 25 ms window and a 10 ms hop - TODO confirm units.
  win_length: 400
  hop_length: 160
  # 80 mel bins; encoder_model.num_features is set to the same value.
  n_mels: 80
  fmin: 0
  # Equals fs / 2.
  fmax: 8000
  htk: False
# Shared model width. Reused through !ref <d_model> by the encoder's d_model,
# the decoder's hidden_dim and the jointer's encoder_dim / decoder_dim.
d_model: 256
# Conformer acoustic encoder. The keys nested below are the keyword arguments
# handed to uetasr.models.encoders.Conformer by the `!new:` tag.
encoder_model: !new:uetasr.models.encoders.Conformer
  # Same value as audio_encoder.n_mels (80).
  num_features: 80
  window_size: 1
  # Resolved from the top-level d_model key.
  d_model: !ref <d_model>
  input_layer: vgg2l
  pos_enc_layer_type: rel_pos
  dropout_rate_pos_enc: 0.2
  selfattention_layer_type: rel_selfattn
  attention_heads: 4
  dropout_rate_att: 0.1
  dropout_rate_pos_wise: 0.1
  dropout_rate: 0.1
  positionwise_layer_type: linear
  linear_units: 1024
  conv_mod_kernel: 31
  num_blocks: 18
  use_macaron: True
  use_cnn_module: True
  # 1e-12, written out in full decimal form.
  eps_layer_norm: 0.000000000001
# RNN label decoder (prediction network). The keys nested below are the
# keyword arguments handed to uetasr.models.decoders.RNNDecoder by `!new:`.
decoder_model: !new:uetasr.models.decoders.RNNDecoder
  # Tied to the tokenizer's vocabulary size (text_encoder.vocab_size).
  vocab_size: !ref <text_encoder.vocab_size>
  embedding_dim: 256
  num_layers: 1
  # Resolved from the top-level d_model key; jointer_model.decoder_dim
  # references the same value.
  hidden_dim: !ref <d_model>
  dropout_embed: 0.2
  dropout_rnn: 0.1
  rnn_type: LSTM
# Joint network combining encoder and decoder outputs. The keys nested below
# are the keyword arguments handed to uetasr.layers.jointer.RNNTJointer.
jointer_model: !new:uetasr.layers.jointer.RNNTJointer
  # Both input widths resolve from the top-level d_model key.
  encoder_dim: !ref <d_model>
  decoder_dim: !ref <d_model>
  hidden_dim: 512
  # Output width is tied to the tokenizer's vocabulary size.
  output_dim: !ref <text_encoder.vocab_size>
# No CTC projection layer is configured; this null is passed to the model
# through !ref <ctc_lin>.
ctc_lin: null
# Full RNN-Transducer model assembled from the components defined above. The
# keys nested below are the keyword arguments handed to
# uetasr.models.rnnt.RNNT by the `!new:` tag.
model: !new:uetasr.models.rnnt.RNNT
  encoder: !ref <encoder_model>
  decoder: !ref <decoder_model>
  jointer: !ref <jointer_model>
  # Resolves to the top-level ctc_lin key (null in this file).
  ctc_lin: !ref <ctc_lin>
  ctc_dropout: 0.1
  # NOTE(review): the original nesting was lost, so it is not visible here
  # whether use_cmvn is an RNNT argument or a top-level option read by the
  # loading script. It is nested under model on the assumption that RNNT
  # accepts it - confirm against the RNNT constructor signature.
  use_cmvn: True