|
{ |
|
"base_config": "config/svc/base.json", |
|
"model_type": "VITS", |
|
"task_type": "svc", |
|
"preprocess": { |
|
|
|
"extract_mel": true, |
|
"extract_pitch": true, |
|
"pitch_extractor": "parselmouth", |
|
"extract_energy": true, |
|
"extract_uv": true, |
|
"extract_linear_spec": true, |
|
"extract_audio": true, |
|
"mel_min_max_norm": true, |
|
|
|
"use_linear": true, |
|
"use_mel": true, |
|
"use_min_max_norm_mel": false, |
|
"use_audio": true, |
|
"use_frame_pitch": true, |
|
"use_uv": true, |
|
"use_spkid": true, |
|
"use_contentvec": false, |
|
"use_whisper": false, |
|
"use_wenet": false, |
|
"use_text": false, |
|
"use_phone": false, |
|
"fmin": 0, |
|
"fmax": 12000, |
|
"f0_min": 50, |
|
"f0_max": 1100, |
|
|
|
"pitch_bin": 256, |
|
|
|
"n_fft": 1024, |
|
|
|
"hop_size": 256, |
|
|
|
"win_size": 1024, |
|
"segment_size": 8192, |
|
"n_mel": 100, |
|
"sample_rate": 24000, |
|
"mel_min_max_stats_dir": "mel_min_max_stats", |
|
"whisper_dir": "whisper", |
|
"contentvec_dir": "contentvec", |
|
"wenet_dir": "wenet", |
|
"mert_dir": "mert", |
|
|
|
"train_file": "train.json", |
|
"valid_file": "test.json", |
|
"spk2id": "singers.json", |
|
"utt2spk": "utt2singer" |
|
}, |
|
"model": { |
|
"condition_encoder": { |
|
"merge_mode": "add", |
|
"input_melody_dim": 1, |
|
"use_log_f0": true, |
|
"n_bins_melody": 256, |
|
"output_melody_dim": 384, |
|
"input_loudness_dim": 1, |
|
"use_log_loudness": true, |
|
"n_bins_loudness": 256, |
|
"output_loudness_dim": 384, |
|
"use_whisper": false, |
|
"use_contentvec": false, |
|
"use_wenet": false, |
|
"use_mert": false, |
|
"whisper_dim": 1024, |
|
"contentvec_dim": 256, |
|
"mert_dim": 256, |
|
"wenet_dim": 512, |
|
"content_encoder_dim": 384, |
|
"singer_table_size": 512, |
|
"output_singer_dim": 384, |
|
"output_content_dim": 384, |
|
"use_spkid": true, |
|
"pitch_max": 1100.0, |
|
"pitch_min": 50.0 |
|
}, |
|
"vits": { |
|
"filter_channels": 256, |
|
"gin_channels": 256, |
|
"hidden_channels": 384, |
|
"inter_channels": 384, |
|
"kernel_size": 3, |
|
"n_flow_layer": 4, |
|
"n_heads": 2, |
|
"n_layers": 6, |
|
"n_layers_q": 3, |
|
"n_speakers": 512, |
|
"p_dropout": 0.1, |
|
"use_spectral_norm": false |
|
}, |
|
"generator": "hifigan", |
|
"generator_config": { |
|
"hifigan": { |
|
"resblock": "1", |
|
"resblock_kernel_sizes": [ |
|
3, |
|
7, |
|
11 |
|
], |
|
"upsample_rates": [ |
|
8, |
|
8, |
|
2, |
|
2 |
|
], |
|
"upsample_kernel_sizes": [ |
|
16, |
|
16, |
|
4, |
|
4 |
|
], |
|
"upsample_initial_channel": 512, |
|
"resblock_dilation_sizes": [ |
|
[ |
|
1, |
|
3, |
|
5 |
|
], |
|
[ |
|
1, |
|
3, |
|
5 |
|
], |
|
[ |
|
1, |
|
3, |
|
5 |
|
] |
|
] |
|
}, |
|
"melgan": { |
|
"ratios": [ |
|
8, |
|
8, |
|
2, |
|
2 |
|
], |
|
"ngf": 32, |
|
"n_residual_layers": 3, |
|
"num_D": 3, |
|
"ndf": 16, |
|
"n_layers": 4, |
|
"downsampling_factor": 4 |
|
}, |
|
"bigvgan": { |
|
"resblock": "1", |
|
"activation": "snakebeta", |
|
"snake_logscale": true, |
|
"upsample_rates": [ |
|
8, |
|
8, |
|
2, |
|
2 |
|
], |
|
"upsample_kernel_sizes": [ |
|
16, |
|
16, |
|
4, |
|
4 |
|
], |
|
"upsample_initial_channel": 512, |
|
"resblock_kernel_sizes": [ |
|
3, |
|
7, |
|
11 |
|
], |
|
"resblock_dilation_sizes": [ |
|
[ |
|
1, |
|
3, |
|
5 |
|
], |
|
[ |
|
1, |
|
3, |
|
5 |
|
], |
|
[ |
|
1, |
|
3, |
|
5 |
|
] |
|
] |
|
}, |
|
"nsfhifigan": { |
|
"resblock": "1", |
|
"harmonic_num": 8, |
|
"upsample_rates": [ |
|
8, |
|
8, |
|
2, |
|
2 |
|
], |
|
"upsample_kernel_sizes": [ |
|
16, |
|
16, |
|
4, |
|
4 |
|
], |
|
"upsample_initial_channel": 768, |
|
"resblock_kernel_sizes": [ |
|
3, |
|
7, |
|
11 |
|
], |
|
"resblock_dilation_sizes": [ |
|
[ |
|
1, |
|
3, |
|
5 |
|
], |
|
[ |
|
1, |
|
3, |
|
5 |
|
], |
|
[ |
|
1, |
|
3, |
|
5 |
|
] |
|
] |
|
}, |
|
"apnet": { |
|
"ASP_channel": 512, |
|
"ASP_resblock_kernel_sizes": [ |
|
3, |
|
7, |
|
11 |
|
], |
|
"ASP_resblock_dilation_sizes": [ |
|
[ |
|
1, |
|
3, |
|
5 |
|
], |
|
[ |
|
1, |
|
3, |
|
5 |
|
], |
|
[ |
|
1, |
|
3, |
|
5 |
|
] |
|
], |
|
"ASP_input_conv_kernel_size": 7, |
|
"ASP_output_conv_kernel_size": 7, |
|
"PSP_channel": 512, |
|
"PSP_resblock_kernel_sizes": [ |
|
3, |
|
7, |
|
11 |
|
], |
|
"PSP_resblock_dilation_sizes": [ |
|
[ |
|
1, |
|
3, |
|
5 |
|
], |
|
[ |
|
1, |
|
3, |
|
5 |
|
], |
|
[ |
|
1, |
|
3, |
|
5 |
|
] |
|
], |
|
"PSP_input_conv_kernel_size": 7, |
|
"PSP_output_R_conv_kernel_size": 7, |
|
"PSP_output_I_conv_kernel_size": 7 |
|
} |
|
} |
|
}, |
|
"train": { |
|
"fp16_run": true, |
|
"learning_rate": 2e-4, |
|
"betas": [ |
|
0.8, |
|
0.99 |
|
], |
|
"eps": 1e-9, |
|
"batch_size": 16, |
|
"lr_decay": 0.999875, |
|
|
|
"init_lr_ratio": 1, |
|
"warmup_epochs": 0, |
|
"c_mel": 45, |
|
"c_kl": 1.0, |
|
"AdamW": { |
|
"betas": [ |
|
0.8, |
|
0.99 |
|
], |
|
"eps": 1e-9 |
|
} |
|
} |
|
} |