{
    "base_config": "config/base.json",
    "model_type": "AutoencoderKL",
    "task_type": "tta",
    "dataset": [
        "AudioCaps"
    ],
    "preprocess": {
        "use_spkid": false,
        "use_uv": false,
        "use_frame_pitch": false,
        "use_phone_pitch": false,
        "use_frame_energy": false,
        "use_phone_energy": false,
        "use_mel": false,
        "use_audio": false,
        "use_label": false,
        "use_one_hot": false
    },
    "model": {
        "autoencoderkl": {
            "ch": 128,
            "ch_mult": [
                1,
                1,
                2,
                2,
                4
            ],
            "num_res_blocks": 2,
            "in_channels": 1,
            "z_channels": 4,
            "out_ch": 1,
            "double_z": true
        },
        "loss": {
            "kl_weight": 1e-8,
            "disc_weight": 0.5,
            "disc_factor": 1.0,
            "logvar_init": 0.0,
            "min_adapt_d_weight": 0.0,
            "max_adapt_d_weight": 10.0,
            "disc_start": 50001,
            "disc_in_channels": 1,
            "disc_num_layers": 3,
            "use_actnorm": false
        }
    },
    "train": {
        "lronPlateau": {
            "factor": 0.9,
            "patience": 100,
            "min_lr": 4.0e-5,
            "verbose": true
        },
        "adam": {
            "lr": 4.0e-4,
            "betas": [
                0.9,
                0.999
            ],
            "weight_decay": 1.0e-2,
            "eps": 1.0e-8
        }
    }
}