Spaces:
Running
Running
{ | |
"base_config": "config/valle.json", | |
"model_type": "VALLE", | |
"dataset": [ | |
"libritts" | |
], | |
"dataset_path": { | |
"libritts": "[LibriTTS dataset path]" | |
}, | |
"preprocess": { | |
"extract_phone": true, | |
"phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)" | |
"extract_acoustic_token": true, | |
"use_phone": true, | |
"use_acoustic_token": true, | |
"processed_dir": "Amphion/data/", | |
"sample_rate": 24000, // "Audio sampling rate." | |
"codec_hop_size": 320, // "Audio codec hop size." | |
"valid_file": "test.json", | |
}, | |
"model": { | |
"prefix_mode": 1, // "The mode for how to prefix VALL-E NAR Decoder, 0: no prefix, 1: 0 to random, 2: random to random, 4: chunk of pre or post utterance.", | |
}, | |
"log_dir": "Amphion/ckpts/tts/valle", | |
"train": { | |
"batch_size": 4, | |
"train_stage": 1, // 0: train all modules, For VALL_E, support 1: AR Decoder 2: NAR Decoder(s) | |
"max_epoch": 20, // "Number of epochs to train." | |
"use_dynamic_batchsize": true, // If use dynamic batch size | |
"max_tokens": 4000, // If use dynamic batch size | |
"max_sentences": 10 // If use dynamic batch size | |
} | |
} | |