File size: 2,257 Bytes
9edce30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 512    # dimension of attention
    attention_heads: 8
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.0
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
    normalize_before: true
    cnn_module_kernel: 31
    use_cnn_module: True
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.0
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 1.0
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

# use raw_wav or kaldi feature
raw_wav: true

# dataset related
dataset_conf:
    filter_conf:
        max_length: 2000
        min_length: 50
        token_max_length: 400 
        token_min_length: 1
    resample_conf:
        resample_rate: 16000
    speed_perturb: false
    fbank_conf:
        num_mel_bins: 80
        frame_shift: 10
        frame_length: 25
        dither: 1.0 
    spec_aug: false
    spec_aug_conf:
        num_t_mask: 3
        num_f_mask: 2
        max_t: 50
        max_f: 10
    shuffle: true
    shuffle_conf:
        shuffle_size: 1500
    sort: true
    sort_conf:
        sort_size: 500  # sort_size should be less than shuffle_size
    batch_conf:
        batch_type: 'dynamic' # static or dynamic
        max_frames_in_batch: 20000
        batch_size: 3

pretrain: True
wav2vec_conf:
    pretrain: True
    quantize_targets: True
    project_targets: True
    latent_vars: 320
    latent_dim: 512
    latent_groups: 2
    w2v_ext_loss: True
    w2v_loss_weights: [1.5,0]
    mask: True
    mask_prob: 0.65

grad_clip: 5
accum_grad: 4
max_epoch: 280
log_interval: 100

optim: adam
optim_conf:
    lr: 0.002
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 25000