maskgct

Running

maskgct / egs /tts /VALLE /exp_config.json

Upload 167 files

8c92a11 verified 14 days ago

1.29 kB

	{
	"base_config": "config/valle.json",
	"model_type": "VALLE",
	"dataset": [
	"libritts"
	],
	"dataset_path": {
	"libritts": "[LibriTTS dataset path]"
	},
	"preprocess": {
	"extract_phone": true,
	"phone_extractor": "espeak", // Options: "espeak", "pypinyin", "pypinyin_initials_finals", "lexicon" (only for language=en-us right now)
	"extract_acoustic_token": true,
	"use_phone": true,
	"use_acoustic_token": true,
	"processed_dir": "Amphion/data/",
	"sample_rate": 24000, // Audio sampling rate.
	"codec_hop_size": 320, // Audio codec hop size.
	"valid_file": "test.json",
	},
	"model": {
	"prefix_mode": 1, // How the VALL-E NAR Decoder is prefixed. 0: no prefix, 1: 0 to random, 2: random to random, 4: chunk of pre or post utterance.
	},
	"log_dir": "Amphion/ckpts/tts/valle",
	"train": {
	"batch_size": 4,
	"train_stage": 1, // 0: train all modules. For VALL-E, also supports 1: AR Decoder, 2: NAR Decoder(s).
	"max_epoch": 20, // "Number of epochs to train."
	"use_dynamic_batchsize": true, // Whether to use dynamic batch size.
	"max_tokens": 4000, // Applies only when use_dynamic_batchsize is true.
	"max_sentences": 10 // Applies only when use_dynamic_batchsize is true.
	}
	}