# Generated 2023-06-20 from:
# /netscratch/sagar/thesis/speechbrain/recipes/RescueSpeech/Enhancement/fine-tuning/hparams/sepformer_16k.yaml
# yamllint disable
# ################################
# Model: SepFormer for source separation
# https://arxiv.org/abs/2010.13154
#
# Author:  Sangeet Sagar 2022
# Dataset : RescueSpeech
# ################################

# Basic parameters
# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 8201
__set_seed: !apply:torch.manual_seed [8201]
experiment_name: sepformer-enhancement
output_folder: results/sepformer-enhancement/8201
train_log: results/sepformer-enhancement/8201/train_log.txt
save_folder: results/sepformer-enhancement/8201/save
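# The paths above follow the results/<experiment_name>/<seed> convention and were
# resolved with seed 8201 when this file was generated.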

# Dataset prep parameters
data_folder: dataset/audio_sythesis/Task_enhancement/ # !PLACEHOLDER
csv_dir: csv_files
train_csv: csv_files/train.csv
valid_csv: csv_files/dev.csv
test_csv: csv_files/test.csv
skip_prep: false
sample_rate: 16000
task: enhance

dereverberate: false
shuffle_train_data: true

# Pretrained models
pretrained_model_path: /netscratch/sagar/thesis/speechbrain/recipes/RescueSpeech/pre-trained/sepformer_dns_16k # !PLACEHOLDER (sepformer_dns_16k model)

# Basic parameters
use_tensorboard: false
tensorboard_logs: results/sepformer-enhancement/8201/logs/

# Experiment params
auto_mix_prec: true     # Set to true to enable automatic mixed-precision training
test_only: false
num_spks: 1
noprogressbar: false
save_audio: true        # Save estimated sources on disk
downsample: false
n_audio_to_save: 500

# Training parameters
N_epochs: 150
batch_size: 1
batch_size_test: 1
lr: 0.00015
clip_grad_norm: 5
loss_upper_lim: 999999  # this is the upper limit for an acceptable loss
# If true, training sequences are truncated to training_signal_len samples
limit_training_signal_len: false
# Length (in samples) of training sequences when truncation is enabled
training_signal_len: 32000
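# 32000 samples correspond to 2 seconds of audio at the 16 kHz sample rate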
ckpt_interval_minutes: 60

# Parameters for data augmentation
use_wavedrop: false
use_speedperturb: true
use_rand_shift: false
min_shift: -8000
max_shift: 8000
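# A shift of +/-8000 samples corresponds to +/-0.5 s at 16 kHz
# (random shifting is only applied when use_rand_shift is true)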

speedperturb: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  perturb_prob: 1.0
  drop_freq_prob: 0.0
  drop_chunk_prob: 0.0
  sample_rate: 16000
  speeds: [95, 100, 105]
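# The speeds above are percentages: each waveform is resampled to 95%, 100%,
# or 105% of its original rate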

wavedrop: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  perturb_prob: 0.0
  drop_freq_prob: 1.0
  drop_chunk_prob: 1.0
  sample_rate: 16000

# Loss thresholding: when enabled, only losses above `threshold`
# (negative SI-SNR, in dB) contribute to the batch loss
threshold_byloss: true
threshold: -30

# Encoder parameters
N_encoder_out: 256
out_channels: 256
kernel_size: 16
kernel_stride: 8
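# At the 16 kHz sample rate, kernel_size 16 with stride 8 corresponds to 1 ms
# analysis windows with 50% overlap in the learned encoder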

# Dataloader options
dataloader_opts:
  batch_size: 1
  num_workers: 3

dataloader_opts_valid:
  batch_size: 1
  num_workers: 3

dataloader_opts_test:
  batch_size: 1
  num_workers: 3

# Specifying the network
Encoder: &id003 !new:speechbrain.lobes.models.dual_path.Encoder
  kernel_size: 16
  out_channels: 256

SBtfintra: &id001 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
  num_layers: 8
  d_model: 256
  nhead: 8
  d_ffn: 1024
  dropout: 0
  use_positional_encoding: true
  norm_before: true

SBtfinter: &id002 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
  num_layers: 8
  d_model: 256
  nhead: 8
  d_ffn: 1024
  dropout: 0
  use_positional_encoding: true
  norm_before: true

MaskNet: &id005 !new:speechbrain.lobes.models.dual_path.Dual_Path_Model
  num_spks: 1
  in_channels: 256
  out_channels: 256
  num_layers: 2
  K: 250
  intra_model: *id001
  inter_model: *id002
  norm: ln
  linear_layer_after_inter_intra: false
  skip_around_intra: true

Decoder: &id004 !new:speechbrain.lobes.models.dual_path.Decoder
  in_channels: 256
  out_channels: 1
  kernel_size: 16
  stride: 8
  bias: false

optimizer: !name:torch.optim.Adam
  lr: 0.00015
  weight_decay: 0

loss: !name:speechbrain.nnet.losses.get_si_snr_with_pitwrapper

lr_scheduler: &id007 !new:speechbrain.nnet.schedulers.ReduceLROnPlateau
  factor: 0.5
  patience: 2
  dont_halve_until_epoch: 85
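# Halves the learning rate (factor 0.5) after `patience` epochs without
# improvement, but never before epoch 85 (dont_halve_until_epoch)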

epoch_counter: &id006 !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: 150

modules:
  encoder: *id003
  decoder: *id004
  masknet: *id005

save_all_checkpoints: false

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: results/sepformer-enhancement/8201/save
  recoverables:
    encoder: *id003
    decoder: *id004
    masknet: *id005
    counter: *id006
    lr_scheduler: *id007
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: results/sepformer-enhancement/8201/train_log.txt

# Pretrainer for fine-tuning: loads the pre-trained SepFormer (DNS, 16 kHz) weights
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    encoder: !ref <Encoder>
    masknet: !ref <MaskNet>
    decoder: !ref <Decoder>
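
# The training script is expected to point these loadables at the checkpoint
# files inside <pretrained_model_path> and then collect/load them via the
# Pretrainer (collect_files / load_collected). A minimal sketch, assuming the
# folder contains encoder.ckpt, masknet.ckpt and decoder.ckpt (file names are
# an assumption, not confirmed by this file):
#
#   paths:
#     encoder: !ref <pretrained_model_path>/encoder.ckpt
#     masknet: !ref <pretrained_model_path>/masknet.ckpt
#     decoder: !ref <pretrained_model_path>/decoder.ckpt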