{
  // Configuration for task_type "svc"; "base_config" names the base config file.
  // This file uses // comments, so it must be read by a comment-tolerant
  // (JSONC/JSON5-style) parser, and each property must stay on its own line.
  "base_config": "config/base.json",
  "task_type": "svc",
  "preprocess": {
    // Data augmentations
    "use_pitch_shift": false,
    "use_formant_shift": false,
    "use_time_stretch": false,
    "use_equalizer": false,

    // Feature extraction mode: "offline" or "online"
    "features_extraction_mode": "offline",

    // Acoustic features
    "extract_mel": true,
    "mel_min_max_norm": true,
    "extract_pitch": true,
    "pitch_extractor": "parselmouth",
    "extract_uv": true,
    "extract_energy": true,

    // Content features
    "extract_whisper_feature": false,
    "whisper_sample_rate": 16000,
    "extract_contentvec_feature": false,
    "contentvec_sample_rate": 16000,
    "extract_wenet_feature": false,
    "wenet_sample_rate": 16000,
    "extract_mert_feature": false,
    "mert_sample_rate": 16000,

    // Default config for whisper
    "whisper_frameshift": 0.01,
    "whisper_downsample_rate": 2,

    // Default config for content vector
    "contentvec_frameshift": 0.02,

    // Default config for mert
    "mert_model": "m-a-p/MERT-v1-330M",
    "mert_feature_layer": -1,
    // Original note: "24k". 320 / 24000 ≈ 0.01333, which equals the frame shift
    // below, so this presumably refers to a 24 kHz rate — TODO confirm, since
    // "mert_sample_rate" above is 16000.
    "mert_hop_size": 320,
    // NOTE(review): key name looks like a typo of "mert_frameshift"; left
    // unchanged because consumers may read this exact spelling — confirm
    // before renaming.
    "mert_frameshit": 0.01333,

    // Default config for wenet
    // 0.01 s = 10 ms
    "wenet_frameshift": 0.01,
    // wenetspeech is 4, gigaspeech is 6
    "wenet_downsample_rate": 4,

    // Default config
    "n_mel": 100,
    "win_size": 1024, // TODO (original marker, no detail given)
    "hop_size": 256,
    "sample_rate": 24000,
    "n_fft": 1024, // TODO (original marker, no detail given)
    "fmin": 0,
    "fmax": 12000, // TODO (original marker, no detail given)
    "f0_min": 50, // ~C2
    "f0_max": 1100, // ~C6 (1100), ~G5 (800)
    "pitch_bin": 256,
    "pitch_max": 1100.0,
    "pitch_min": 50.0,
    "is_label": true,
    "is_mu_law": true,
    "bits": 8,
    "mel_min_max_stats_dir": "mel_min_max_stats",
    "whisper_dir": "whisper",
    "contentvec_dir": "contentvec",
    "wenet_dir": "wenet",
    "mert_dir": "mert",

    // Extract content features using a dataloader
    "pin_memory": true,
    "num_workers": 8,
    "content_feature_batch_size": 16,

    // Features used for model training
    "use_mel": true,
    "use_min_max_norm_mel": true,
    "use_frame_pitch": true,
    "use_uv": true,
    "use_interpolation_for_uv": false,
    "use_frame_energy": true,
    "use_log_scale_pitch": false,
    "use_log_scale_energy": false,
    "use_spkid": true,

    // Meta files
    "train_file": "train.json",
    "valid_file": "test.json",
    "spk2id": "singers.json",
    "utt2spk": "utt2singer"
  },
  "model": {
    "condition_encoder": {
      "merge_mode": "add",

      // Prosody features
      "use_f0": true,
      "use_uv": true,
      "use_energy": true,

      // Quantization (0 means no quantization)
      "input_melody_dim": 1,
      "n_bins_melody": 256,
      "output_melody_dim": 384,
      "input_loudness_dim": 1,
      "n_bins_loudness": 256,
      "output_loudness_dim": 384,

      // Semantic features
      "use_whisper": false,
      "use_contentvec": false,
      "use_wenet": false,
      "use_mert": false,
      "whisper_dim": 1024,
      "contentvec_dim": 256,
      "mert_dim": 256,
      "wenet_dim": 512,
      "content_encoder_dim": 384,

      // Speaker features
      "output_singer_dim": 384,
      "singer_table_size": 512,
      "use_spkid": true
    }
  }
}