// VITS TTS experiment configuration.
// NOTE: this file uses `//` comments and trailing commas, so it must be read
// with a comment-tolerant parser (e.g. JSON5), not a strict JSON parser.
// NOTE(review): values here are presumably applied on top of "base_config" --
// confirm against the config loader.
{
  "base_config": "config/vits.json",
  "model_type": "VITS",
  "dataset": [
    "LJSpeech",
    //"hifitts"
  ],
  "dataset_path": {
    // TODO: Fill in your dataset path
    "LJSpeech": "[LJSpeech dataset path]",
    //"hifitts": "[Hi-Fi TTS dataset path]"
  },
  // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts"
  "log_dir": "ckpts/tts",
  "preprocess": {
    //"extract_audio": true,
    "use_phone": true,
    // linguistic features
    "extract_phone": true,
    // one of: espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)
    "phone_extractor": "espeak",
    // TODO: Fill in the output data path. The default value is "Amphion/data"
    "processed_dir": "data",
    "sample_rate": 22050, // target sampling rate
    "valid_file": "valid.json", // validation set
    //"use_spkid": true // use speaker ID to train multi-speaker TTS model
  },
  "model": {
    // number of speakers; must be greater than or equal to the number of
    // speakers in the dataset(s) used. The default value is 0 if not specified.
    //"n_speakers": 10
  },
  "train": {
    "batch_size": 16,
    //"multi_speaker_training": true
  }
}