{
    "base_config": "config/vitssvc.json",
    "model_type": "VitsSVC",
    "dataset": [
        "m4singer",
        "opencpop",
        "opensinger",
        "svcc",
        "vctk"
    ],
    "dataset_path": {
        "m4singer": "[M4Singer dataset path]",
        "opencpop": "[Opencpop dataset path]",
        "opensinger": "[OpenSinger dataset path]",
        "svcc": "[SVCC dataset path]",
        "vctk": "[VCTK dataset path]"
    },
    "use_custom_dataset": [],
    "log_dir": "ckpts/svc",
    "preprocess": {
        "processed_dir": "data",
        "n_mel": 100,
        "sample_rate": 24000,

        "extract_contentvec_feature": true,
        "contentvec_sample_rate": 16000,
        "contentvec_batch_size": 1,
        "contentvec_frameshift": 0.02,

        "extract_whisper_feature": true,
        "whisper_sample_rate": 16000,
        "whisper_frameshift": 0.01,
        "whisper_downsample_rate": 2,

        "extract_wenet_feature": true,
        "wenet_downsample_rate": 4,
        "wenet_frameshift": 0.01,
        "wenet_sample_rate": 16000,

        "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
        "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
        "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
        "whisper_model": "medium",
        "whisper_model_path": "pretrained/whisper/medium.pt",

        "use_contentvec": true,
        "use_whisper": true,
        "use_wenet": false,

        "pin_memory": true,
        "num_workers": 8,
        "content_feature_batch_size": 16
    },
    "model": {
        "condition_encoder": {
            "merge_mode": "add",
            "use_log_loudness": true,
            "use_contentvec": true,
            "use_whisper": true,
            "use_wenet": false,
            "whisper_dim": 1024,
            "contentvec_dim": 256,
            "wenet_dim": 512
        },
        "vits": {
            "inter_channels": 384,
            "hidden_channels": 384,
            "filter_channels": 256,
            "n_heads": 2,
            "n_layers": 6,
            "kernel_size": 3,
            "p_dropout": 0.1,
            "n_flow_layer": 4,
            "n_layers_q": 3,
            "gin_channels": 256,
            "n_speakers": 512,
            "use_spectral_norm": false
        },
        "generator": "nsfhifigan"
    },
    "train": {
        "batch_size": 32,
        "learning_rate": 2e-4,
        "gradient_accumulation_step": 1,
        "max_epoch": -1,
        "save_checkpoint_stride": [
            3,
            50
        ],
        "keep_last": [
            3,
            2
        ]
    },
    "inference": {
        "batch_size": 1
    }
}