{
  "base_config": "config/base.json",
  "model_type": "AudioLDM",
  "task_type": "tta",
  "dataset": [
    "AudioCaps"
  ],
  "preprocess": {
    "use_spkid": false,
    "use_uv": false,
    "use_frame_pitch": false,
    "use_phone_pitch": false,
    "use_frame_energy": false,
    "use_phone_energy": false,
    "use_mel": false,
    "use_audio": false,
    "use_label": false,
    "use_one_hot": false,
    "cond_mask_prob": 0.1
  },
  "model": {
    "audioldm": {
      "image_size": 32,
      "in_channels": 4,
      "out_channels": 4,
      "model_channels": 256,
      "attention_resolutions": [
        4,
        2,
        1
      ],
      "num_res_blocks": 2,
      "channel_mult": [
        1,
        2,
        4
      ],
      "num_heads": 8,
      "use_spatial_transformer": true,
      "transformer_depth": 1,
      "context_dim": 768,
      "use_checkpoint": true,
      "legacy": false
    },
    "autoencoderkl": {
      "ch": 128,
      "ch_mult": [
        1,
        1,
        2,
        2,
        4
      ],
      "num_res_blocks": 2,
      "in_channels": 1,
      "z_channels": 4,
      "out_ch": 1,
      "double_z": true
    },
    "noise_scheduler": {
      "num_train_timesteps": 1000,
      "beta_start": 0.00085,
      "beta_end": 0.012,
      "beta_schedule": "scaled_linear",
      "clip_sample": false,
      "steps_offset": 1,
      "set_alpha_to_one": false,
      "skip_prk_steps": true,
      "prediction_type": "epsilon"
    }
  },
  "train": {
    "lronPlateau": {
      "factor": 0.9,
      "patience": 100,
      "min_lr": 4.0e-5,
      "verbose": true
    },
    "adam": {
      "lr": 5.0e-5,
      "betas": [
        0.9,
        0.999
      ],
      "weight_decay": 1.0e-2,
      "eps": 1.0e-8
    }
  }
}