# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 512    # dimension of attention
    attention_heads: 8
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.0
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
    normalize_before: true
    cnn_module_kernel: 31
    use_cnn_module: True
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.0
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 1.0     # loss = ctc_weight * ctc + (1 - ctc_weight) * attention; 1.0 means pure CTC
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

# use raw_wav or kaldi feature
raw_wav: true

# dataset related
dataset_conf:
    filter_conf:
        max_length: 2000        # drop utterances longer than this, in 10 ms frames
        min_length: 50          # drop utterances shorter than this, in 10 ms frames
        token_max_length: 400   # drop utterances with more label tokens than this
        token_min_length: 1
    resample_conf:
        resample_rate: 16000
    speed_perturb: false
    fbank_conf:
        num_mel_bins: 80
        frame_shift: 10     # ms
        frame_length: 25    # ms
        dither: 1.0
    spec_aug: false
    spec_aug_conf:
        num_t_mask: 3
        num_f_mask: 2
        max_t: 50
        max_f: 10
    shuffle: true
    shuffle_conf:
        shuffle_size: 1500
    sort: true
    sort_conf:
        sort_size: 500  # sort_size should be less than shuffle_size
    batch_conf:
        batch_type: 'dynamic' # static or dynamic
        max_frames_in_batch: 20000
        batch_size: 3

pretrain: True
wav2vec_conf:
    pretrain: True
    quantize_targets: True
    project_targets: True
    latent_vars: 320
    latent_dim: 512
    latent_groups: 2
    w2v_ext_loss: True
    w2v_loss_weights: [1.5, 0]
    mask: True
    mask_prob: 0.65

grad_clip: 5
accum_grad: 4
max_epoch: 280
log_interval: 100

optim: adam
optim_conf:
    lr: 0.002
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 25000
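
# Note (assuming the WeNet-style 'warmuplr' scheduler): the learning rate
# follows the Noam-style rule
#   lr(step) = lr * warmup_steps^0.5 * min(step^-0.5, step * warmup_steps^-1.5)
# so it ramps linearly to the configured lr (0.002) at step 25000 and then
# decays proportionally to step^-0.5.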
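
# Sketch: with batch_type 'dynamic', batches are grown until
# max_frames_in_batch feature frames are reached, and batch_size is ignored.
# To use fixed-size batching instead, batch_conf would look like the
# commented-out example below (the batch_size of 16 is only illustrative):
#
# batch_conf:
#     batch_type: 'static'
#     batch_size: 16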
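
# Note: gradients are accumulated over accum_grad (4) micro-batches before
# each optimizer step, so a single update sees up to
# max_frames_in_batch * accum_grad = 20000 * 4 = 80000 feature frames,
# roughly 800 s of audio at the 10 ms frame_shift above.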