# Text featurizer: an instance of uetasr.featurizers.text.Subword built by the
# `!new:` tag, with the nested keys below passed as its constructor arguments.
# NOTE(review): the key names look like SentencePiece trainer options; confirm
# against the Subword class before changing any of the ids/pieces.
text_encoder: !new:uetasr.featurizers.text.Subword
    model_prefix: vocabs/subword_vietnamese_500  # suffix matches vocab_size below
    data_path: transcript_v3.1.txt
    character_coverage: 1.0
    model_type: bpe  # alternatives noted by the author: word, bpe, unigram, char
    num_threads: 16
    # Special-token ids and their surface forms. Index 0 is the pad piece,
    # which is spelled <blank>; eos_id is -1.
    # NOTE(review): -1 presumably disables the EOS piece - confirm.
    unk_id: 1
    pad_id: 0
    eos_id: -1
    unk_piece: <unk>
    pad_piece: <blank>
    eos_piece: </s>
    # Referenced elsewhere in this file as <text_encoder.vocab_size>.
    vocab_size: 500
# Audio featurizer: an instance of uetasr.featurizers.audio.LogMelSpectrogram
# built by the `!new:` tag from the nested keyword arguments.
# NOTE(review): units are not stated in this file; fs is presumably in Hz and
# n_fft / win_length / hop_length in samples - confirm against the class.
audio_encoder: !new:uetasr.featurizers.audio.LogMelSpectrogram
    fs: 16000
    n_fft: 512
    win_length: 400
    hop_length: 160
    n_mels: 80  # same value as encoder_model.num_features
    fmin: 0
    fmax: 8000
    # NOTE(review): nesting was lost in the source; `htk` is assumed to be the
    # last argument of this featurizer - confirm.
    htk: False
# Shared model width. Kept at top level because it is the target of the
# `!ref <d_model>` references used by the encoder, decoder and jointer.
d_model: 256
# Acoustic encoder: an instance of uetasr.models.encoders.Conformer built by
# the `!new:` tag from the nested keyword arguments.
encoder_model: !new:uetasr.models.encoders.Conformer
    num_features: 80  # same value as audio_encoder.n_mels
    window_size: 1
    d_model: !ref <d_model>
    input_layer: vgg2l
    # Positional encoding and self-attention are both set to relative variants.
    pos_enc_layer_type: rel_pos
    dropout_rate_pos_enc: 0.2
    selfattention_layer_type: rel_selfattn
    attention_heads: 4
    dropout_rate_att: 0.1
    dropout_rate_pos_wise: 0.1
    dropout_rate: 0.1
    positionwise_layer_type: linear
    linear_units: 1024
    conv_mod_kernel: 31
    num_blocks: 18
    use_macaron: True
    use_cnn_module: True
    eps_layer_norm: 0.000000000001  # 1e-12, written out in full decimal form
# Decoder: an instance of uetasr.models.decoders.RNNDecoder built by the
# `!new:` tag from the nested keyword arguments.
decoder_model: !new:uetasr.models.decoders.RNNDecoder
    vocab_size: !ref <text_encoder.vocab_size>  # follows the text featurizer's vocabulary size
    embedding_dim: 256
    num_layers: 1
    hidden_dim: !ref <d_model>  # same width as the encoder
    dropout_embed: 0.2
    dropout_rnn: 0.1
    rnn_type: LSTM
# Joint network: an instance of uetasr.layers.jointer.RNNTJointer built by the
# `!new:` tag from the nested keyword arguments.
jointer_model: !new:uetasr.layers.jointer.RNNTJointer
    # Both input widths reference the shared <d_model> value.
    encoder_dim: !ref <d_model>
    decoder_dim: !ref <d_model>
    hidden_dim: 512
    output_dim: !ref <text_encoder.vocab_size>  # follows the text featurizer's vocabulary size
# Set to null here. Kept at top level because it is the target of
# `!ref <ctc_lin>` in the `model` definition.
ctc_lin: null
# Full transducer: an instance of uetasr.models.rnnt.RNNT built by the `!new:`
# tag, wired to the encoder, decoder and jointer objects defined in this file.
model: !new:uetasr.models.rnnt.RNNT
    encoder: !ref <encoder_model>
    decoder: !ref <decoder_model>
    jointer: !ref <jointer_model>
    ctc_lin: !ref <ctc_lin>  # resolves to the top-level ctc_lin value (null)
    # NOTE(review): nesting was lost in the source. `ctc_dropout` and `use_cmvn`
    # are assumed to be RNNT constructor arguments because they directly follow
    # the other arguments - confirm against uetasr.models.rnnt.RNNT.
    ctc_dropout: 0.1
    use_cmvn: True