```json
{
    "supported_model_type": [
        "GANVocoder",
        "Fastspeech2",
        "DiffSVC",
        "Transformer",
        "EDM",
        "CD"
    ],
    "task_type": "",
    "dataset": [],
    "use_custom_dataset": [],
    "preprocess": {
        "phone_extractor": "espeak", // "espeak", "pypinyin", "pypinyin_initials_finals", "lexicon"
        // trim audio silence
        "data_augment": false,
        "trim_silence": false,
        "num_silent_frames": 8,
        "trim_fft_size": 512, // FFT size used in trimming
        "trim_hop_size": 128, // hop size used in trimming
        "trim_top_db": 30, // top dB used in trimming; sensitive to each dataset
        // acoustic features
        "extract_mel": false,
        "mel_extract_mode": "",
        "extract_linear_spec": false,
        "extract_mcep": false,
        "extract_pitch": false,
        "extract_acoustic_token": false,
        "pitch_remove_outlier": false,
        "extract_uv": false,
        "pitch_norm": false,
        "extract_audio": false,
        "extract_label": false,
        "pitch_extractor": "parselmouth", // "pyin", "dio", "pyworld", "pyreaper", "parselmouth", "CWT" (Continuous Wavelet Transform)
        "extract_energy": false,
        "energy_remove_outlier": false,
        "energy_norm": false,
        "energy_extract_mode": "from_mel",
        "extract_duration": false,
        "extract_amplitude_phase": false,
        "mel_min_max_norm": false,
        // linguistic features
        "extract_phone": false,
        "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
        // content features
        "extract_whisper_feature": false,
        "extract_contentvec_feature": false,
        "extract_mert_feature": false,
        "extract_wenet_feature": false,
        // Settings for data preprocessing
        "n_mel": 80,
        "win_size": 480,
        "hop_size": 120,
        "sample_rate": 24000,
        "n_fft": 1024,
        "fmin": 0,
        "fmax": 12000,
        "min_level_db": -115,
        "ref_level_db": 20,
        "bits": 8,
        // Directory names of processed data or extracted features
        "processed_dir": "processed_data",
        "trimmed_wav_dir": "trimmed_wavs", // directory name of silence-trimmed wavs
        "raw_data": "raw_data",
        "phone_dir": "phones",
        "wav_dir": "wavs", // directory name of processed wavs (such as downsampled waveforms)
        "audio_dir": "audios",
        "log_amplitude_dir": "log_amplitudes",
        "phase_dir": "phases",
        "real_dir": "reals",
        "imaginary_dir": "imaginarys",
        "label_dir": "labels",
        "linear_dir": "linears",
        "mel_dir": "mels", // directory name of extracted mel features
        "mcep_dir": "mcep", // directory name of extracted mcep features
        "dur_dir": "durs",
        "symbols_dict": "symbols.dict",
        "lab_dir": "labs", // directory name of extracted label features
        "wenet_dir": "wenet", // directory name of extracted wenet features
        "contentvec_dir": "contentvec", // directory name of extracted contentvec features
        "pitch_dir": "pitches", // directory name of extracted pitch features
        "energy_dir": "energys", // directory name of extracted energy features
        "phone_pitch_dir": "phone_pitches", // directory name of extracted phone-level pitch features
        "phone_energy_dir": "phone_energys", // directory name of extracted phone-level energy features
        "uv_dir": "uvs", // directory name of extracted unvoiced (UV) features
        "duration_dir": "duration", // ground-truth duration files
        "phone_seq_file": "phone_seq_file", // phoneme sequence file
        "file_lst": "file.lst",
        "train_file": "train.json", // training set; this JSON file contains detailed information about the dataset, including dataset name, utterance ID, and utterance duration
        "valid_file": "valid.json", // validation set
        "spk2id": "spk2id.json", // used for multi-speaker datasets
        "utt2spk": "utt2spk", // used for multi-speaker datasets
        "emo2id": "emo2id.json", // used for multi-emotion datasets
        "utt2emo": "utt2emo", // used for multi-emotion datasets
        // Features used for model training
        "use_text": false,
        "use_phone": false,
        "use_phn_seq": false,
        "use_lab": false,
        "use_linear": false,
        "use_mel": false,
        "use_min_max_norm_mel": false,
        "use_wav": false,
        "use_phone_pitch": false,
        "use_log_scale_pitch": false,
        "use_phone_energy": false,
        "use_phone_duration": false,
        "use_log_scale_energy": false,
        "use_wenet": false,
        "use_dur": false,
        "use_spkid": false, // True: use speaker id for multi-speaker dataset
        "use_emoid": false, // True: use emotion id for multi-emotion dataset
        "use_frame_pitch": false,
        "use_uv": false,
        "use_frame_energy": false,
        "use_frame_duration": false,
        "use_audio": false,
        "use_label": false,
        "use_one_hot": false,
        "use_amplitude_phase": false,
        "align_mel_duration": false
    },
"train": { | |
"ddp": true, | |
"batch_size": 16, | |
"max_steps": 1000000, | |
// Trackers | |
"tracker": [ | |
"tensorboard" | |
// "wandb", | |
// "cometml", | |
// "mlflow", | |
], | |
"max_epoch": -1, | |
// -1 means no limit | |
"save_checkpoint_stride": [ | |
5, | |
20 | |
], | |
// unit is epoch | |
"keep_last": [ | |
3, | |
-1 | |
], | |
// -1 means infinite, if one number will broadcast | |
"run_eval": [ | |
false, | |
true | |
], | |
// if one number will broadcast | |
// Fix the random seed | |
"random_seed": 10086, | |
// Optimizer | |
"optimizer": "AdamW", | |
"adamw": { | |
"lr": 4.0e-4 | |
// nn model lr | |
}, | |
// LR Scheduler | |
"scheduler": "ReduceLROnPlateau", | |
"reducelronplateau": { | |
"factor": 0.8, | |
"patience": 10, | |
// unit is epoch | |
"min_lr": 1.0e-4 | |
}, | |
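        // standard PyTorch-style ReduceLROnPlateau behavior: lr is multiplied by `factor`
        // after `patience` epochs without improvement, and never drops below `min_lr`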
        // Batchsampler
        "sampler": {
            "holistic_shuffle": true,
            "drop_last": true
        },
        // Dataloader
        "dataloader": {
            "num_worker": 32,
            "pin_memory": true
        },
        "gradient_accumulation_step": 1,
"total_training_steps": 50000, | |
"save_summary_steps": 500, | |
"save_checkpoints_steps": 10000, | |
"valid_interval": 10000, | |
"keep_checkpoint_max": 5, | |
"multi_speaker_training": false // True: train multi-speaker model; False: training single-speaker model; | |
} | |
} |
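Because the file uses `//` comments, it is not strict JSON. Below is a minimal sketch of how such a config might be loaded and overridden, assuming a JSON5-compatible parser (the `json5` package) and an experiment config whose keys override these base defaults; the file paths and the override convention are illustrative assumptions, not the toolkit's confirmed API.

```python
# Minimal sketch: load a JSONC/JSON5-style base config and overlay an
# experiment config on top of it. `json5` tolerates the // comments above.
import json5  # pip install json5


def deep_update(base: dict, override: dict) -> dict:
    """Recursively merge `override` into `base` (override wins)."""
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            deep_update(base[key], value)
        else:
            base[key] = value
    return base


with open("config/base.json") as f:  # assumed location of the config above
    cfg = json5.load(f)

with open("egs/exp_config.json") as f:  # hypothetical experiment config
    cfg = deep_update(cfg, json5.load(f))

# Derived quantity from the preprocess section
sr = cfg["preprocess"]["sample_rate"]
hop = cfg["preprocess"]["hop_size"]
print(f"frame rate: {sr / hop:.0f} frames/sec")  # 24000 / 120 = 200
```

With the defaults above, the printed frame rate is 200 frames per second; this is worth re-checking whenever `sample_rate` or `hop_size` changes, since mel, pitch, and energy features must share the same frame grid to stay aligned.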