# Generated 2023-06-20 from:
# /netscratch/sagar/thesis/speechbrain/recipes/RescueSpeech/Enhancement/fine-tuning/hparams/sepformer_16k.yaml
# yamllint disable
# ################################
# Model: SepFormer for source separation
# https://arxiv.org/abs/2010.13154
#
# Author: Sangeet Sagar 2022
# Dataset : RescueSpeech
# ################################
# Basic parameters
# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 8201
__set_seed: !apply:torch.manual_seed [8201]
experiment_name: sepformer-enhancement
output_folder: results/sepformer-enhancement/8201
train_log: results/sepformer-enhancement/8201/train_log.txt
save_folder: results/sepformer-enhancement/8201/save
# Dataset prep parameters
data_folder: dataset/audio_sythesis/Task_enhancement/ # !PLACEHOLDER
csv_dir: csv_files
train_csv: csv_files/train.csv
valid_csv: csv_files/dev.csv
test_csv: csv_files/test.csv
skip_prep: false
sample_rate: 16000
task: enhance
dereverberate: false
shuffle_train_data: true
# Pretrained models
pretrained_model_path: /netscratch/sagar/thesis/speechbrain/recipes/RescueSpeech/pre-trained/sepformer_dns_16k # !PLACEHOLDER # sepformer_dns_16k model
# Basic parameters
use_tensorboard: false
tensorboard_logs: results/sepformer-enhancement/8201/logs/
# Experiment params
auto_mix_prec: true # Set it to True for mixed precision
test_only: false
num_spks: 1
noprogressbar: false
save_audio: true # Save estimated sources on disk
downsample: false
n_audio_to_save: 500
# Training parameters
N_epochs: 150
batch_size: 1
batch_size_test: 1
lr: 0.00015
clip_grad_norm: 5
loss_upper_lim: 999999 # this is the upper limit for an acceptable loss
# if True, the training sequences are cut to a specified length
limit_training_signal_len: false
# this is the length of sequences if we choose to limit
# the signal length of training sequences
training_signal_len: 32000
ckpt_interval_minutes: 60
# Parameters for data augmentation
use_wavedrop: false
use_speedperturb: true
use_rand_shift: false
min_shift: -8000
max_shift: 8000
speedperturb: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
    perturb_prob: 1.0
    drop_freq_prob: 0.0
    drop_chunk_prob: 0.0
    sample_rate: 16000
    speeds: [95, 100, 105]
wavedrop: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
    perturb_prob: 0.0
    drop_freq_prob: 1.0
    drop_chunk_prob: 1.0
    sample_rate: 16000
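# Illustrative usage (assumed; the actual call sites are in train.py): when the
# flags above are enabled, the Brain class applies these augmenters to the
# waveforms during training, roughly as
#     noisy = self.hparams.speedperturb(noisy, lens)  # if use_speedperturb
#     noisy = self.hparams.wavedrop(noisy, lens)      # if use_wavedrop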
# loss thresholding -- this thresholds the training loss
threshold_byloss: true
threshold: -30
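# With threshold_byloss true, the training loop is assumed to average only the
# utterance losses above `threshold` (the loss is negative SI-SNR in dB), e.g.
#     loss = loss[loss > threshold].mean()
# so examples that are already enhanced beyond ~30 dB SI-SNR stop contributing
# gradients; see train.py for the actual implementation.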
# Encoder parameters
N_encoder_out: 256
out_channels: 256
kernel_size: 16
kernel_stride: 8
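# At sample_rate 16000, a kernel_size of 16 with kernel_stride 8 corresponds to
# 1 ms analysis windows with a 0.5 ms hop (50% overlap) in the learned encoder.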
# Dataloader options
dataloader_opts:
    batch_size: 1
    num_workers: 3
dataloader_opts_valid:
    batch_size: 1
    num_workers: 3
dataloader_opts_test:
    batch_size: 1
    num_workers: 3
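# These dicts are assumed to be forwarded to the PyTorch DataLoader by the
# training script (e.g. as train_loader_kwargs / valid_loader_kwargs of
# Brain.fit) rather than being consumed anywhere in this file.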
# Specifying the network
Encoder: &id003 !new:speechbrain.lobes.models.dual_path.Encoder
    kernel_size: 16
    out_channels: 256
SBtfintra: &id001 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
    num_layers: 8
    d_model: 256
    nhead: 8
    d_ffn: 1024
    dropout: 0
    use_positional_encoding: true
    norm_before: true
SBtfinter: &id002 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
    num_layers: 8
    d_model: 256
    nhead: 8
    d_ffn: 1024
    dropout: 0
    use_positional_encoding: true
    norm_before: true
MaskNet: &id005 !new:speechbrain.lobes.models.dual_path.Dual_Path_Model
    num_spks: 1
    in_channels: 256
    out_channels: 256
    num_layers: 2
    K: 250
    intra_model: *id001
    inter_model: *id002
    norm: ln
    linear_layer_after_inter_intra: false
    skip_around_intra: true
Decoder: &id004 !new:speechbrain.lobes.models.dual_path.Decoder
    in_channels: 256
    out_channels: 1
    kernel_size: 16
    stride: 8
    bias: false
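# Sketch of how the three modules are assumed to be combined in train.py
# (the standard masking pipeline used by SepFormer recipes):
#     mix_w = Encoder(noisy)             # [B, N, T']
#     masks = MaskNet(mix_w)             # [num_spks, B, N, T']
#     enhanced = Decoder(mix_w * masks[0])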
optimizer: !name:torch.optim.Adam
    lr: 0.00015
    weight_decay: 0
loss: !name:speechbrain.nnet.losses.get_si_snr_with_pitwrapper
lr_scheduler: &id007 !new:speechbrain.nnet.schedulers.ReduceLROnPlateau
    factor: 0.5
    patience: 2
    dont_halve_until_epoch: 85
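# The scheduler halves the learning rate (factor 0.5) once the validation loss
# has not improved for `patience` epochs, and applies no reduction before
# epoch 85 (dont_halve_until_epoch).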
epoch_counter: &id006 !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: 150
modules:
    encoder: *id003
    decoder: *id004
    masknet: *id005
save_all_checkpoints: false
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: results/sepformer-enhancement/8201/save
    recoverables:
        encoder: *id003
        decoder: *id004
        masknet: *id005
        counter: *id006
        lr_scheduler: *id007
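# The checkpointer snapshots every object listed under `recoverables`, so an
# interrupted run can resume with model, scheduler and epoch-counter state
# intact; intra-epoch saving is assumed to follow ckpt_interval_minutes above.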
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: results/sepformer-enhancement/8201/train_log.txt
## Pretrainer: loads the pre-trained sepformer_dns_16k model into the encoder,
## masknet and decoder so it can be fine-tuned on RescueSpeech.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        encoder: !ref <Encoder>
        masknet: !ref <MaskNet>
        decoder: !ref <Decoder>
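# Sketch of how this file is typically consumed (assumed; the real entry point
# is the recipe's train.py):
#     from hyperpyyaml import load_hyperpyyaml
#     with open("hparams/sepformer_16k.yaml") as fin:
#         hparams = load_hyperpyyaml(fin)
#     hparams["pretrainer"].collect_files()   # gather the sepformer_dns_16k weights
#     hparams["pretrainer"].load_collected()  # load them into Encoder/MaskNet/Decoder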