Spaces:
Running
Running
{
    "base_config": "config/base.json",
    "model_type": "AudioLDM",
    "task_type": "tta",
    "dataset": [
        "AudioCaps"
    ],
    "preprocess": {
        // Features used for model training
        "use_spkid": false,
        "use_uv": false,
        "use_frame_pitch": false,
        "use_phone_pitch": false,
        "use_frame_energy": false,
        "use_phone_energy": false,
        "use_mel": false,
        "use_audio": false,
        "use_label": false,
        "use_one_hot": false,
        "cond_mask_prob": 0.1
    },
    // Model settings: audioldm, autoencoderkl, and noise_scheduler
    "model": {
        "audioldm": {
            "image_size": 32,
            "in_channels": 4,
            "out_channels": 4,
            "model_channels": 256,
            "attention_resolutions": [
                4,
                2,
                1
            ],
            "num_res_blocks": 2,
            "channel_mult": [
                1,
                2,
                4
            ],
            "num_heads": 8,
            "use_spatial_transformer": true,
            "transformer_depth": 1,
            "context_dim": 768,
            "use_checkpoint": true,
            "legacy": false
        },
        "autoencoderkl": {
            "ch": 128,
            "ch_mult": [
                1,
                1,
                2,
                2,
                4
            ],
            "num_res_blocks": 2,
            "in_channels": 1,
            "z_channels": 4,
            "out_ch": 1,
            "double_z": true
        },
        "noise_scheduler": {
            "num_train_timesteps": 1000,
            "beta_start": 0.00085,
            "beta_end": 0.012,
            "beta_schedule": "scaled_linear",
            "clip_sample": false,
            "steps_offset": 1,
            "set_alpha_to_one": false,
            "skip_prk_steps": true,
            "prediction_type": "epsilon"
        }
    },
    // Training settings: lronPlateau (learning-rate schedule) and adam (optimizer)
    "train": {
        "lronPlateau": {
            "factor": 0.9,
            "patience": 100,
            "min_lr": 4.0e-5,
            "verbose": true
        },
        "adam": {
            "lr": 5.0e-5,
            "betas": [
                0.9,
                0.999
            ],
            "weight_decay": 1.0e-2,
            "eps": 1.0e-8
        }
    }
}