{
    "base_config": "config/base.json",
    "task_type": "svc",
    "preprocess": {

        "use_pitch_shift": false,
        "use_formant_shift": false,
        "use_time_stretch": false,
        "use_equalizer": false,

        "features_extraction_mode": "offline",

        "extract_mel": true,
        "mel_min_max_norm": true,
        "extract_pitch": true,
        "pitch_extractor": "parselmouth",
        "extract_uv": true,
        "extract_energy": true,

        "extract_whisper_feature": false,
        "whisper_sample_rate": 16000,
        "extract_contentvec_feature": false,
        "contentvec_sample_rate": 16000,
        "extract_wenet_feature": false,
        "wenet_sample_rate": 16000,
        "extract_mert_feature": false,
        "mert_sample_rate": 16000,

        "whisper_frameshift": 0.01,
        "whisper_downsample_rate": 2,

        "contentvec_frameshift": 0.02,

        "mert_model": "m-a-p/MERT-v1-330M",
        "mert_feature_layer": -1,
        "mert_hop_size": 320,

        "mert_frameshit": 0.01333,

        "wenet_frameshift": 0.01,

        "wenet_downsample_rate": 4,

        "n_mel": 100,
        "win_size": 1024,

        "hop_size": 256,
        "sample_rate": 24000,
        "n_fft": 1024,

        "fmin": 0,
        "fmax": 12000,

        "f0_min": 50,

        "f0_max": 1100,

        "pitch_bin": 256,
        "pitch_max": 1100.0,
        "pitch_min": 50.0,
        "is_label": true,
        "is_mu_law": true,
        "bits": 8,
        "mel_min_max_stats_dir": "mel_min_max_stats",
        "whisper_dir": "whisper",
        "contentvec_dir": "contentvec",
        "wenet_dir": "wenet",
        "mert_dir": "mert",

        "pin_memory": true,
        "num_workers": 8,
        "content_feature_batch_size": 16,

        "use_mel": true,
        "use_min_max_norm_mel": true,
        "use_frame_pitch": true,
        "use_uv": true,
        "use_interpolation_for_uv": false,
        "use_frame_energy": true,
        "use_log_scale_pitch": false,
        "use_log_scale_energy": false,
        "use_spkid": true,

        "train_file": "train.json",
        "valid_file": "test.json",
        "spk2id": "singers.json",
        "utt2spk": "utt2singer"
    },
    "model": {
        "condition_encoder": {
            "merge_mode": "add",

            "use_f0": true,
            "use_uv": true,
            "use_energy": true,

            "input_melody_dim": 1,
            "n_bins_melody": 256,
            "output_melody_dim": 384,
            "input_loudness_dim": 1,
            "n_bins_loudness": 256,
            "output_loudness_dim": 384,

            "use_whisper": false,
            "use_contentvec": false,
            "use_wenet": false,
            "use_mert": false,
            "whisper_dim": 1024,
            "contentvec_dim": 256,
            "mert_dim": 256,
            "wenet_dim": 512,
            "content_encoder_dim": 384,

            "output_singer_dim": 384,
            "singer_table_size": 512,
            "use_spkid": true
        }
    }
}