{
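    // Experiment config for VALL-E TTS on LibriTTS. Fields below are expected to
    // override the defaults inherited from "base_config" (config/tts.json).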
    "base_config": "config/tts.json",
    "model_type": "VALLE",
    "task_type": "tts",
    "dataset": [
        "libritts"
    ],
    "preprocess": {
        "extract_phone": true,
        "phone_extractor": "espeak", // phoneme extractor: espeak, pypinyin, pypinyin_initials_finals or lexicon 
        "extract_acoustic_token": true,
        "acoustic_token_extractor": "Encodec", // acoustic token extractor: encodec, dac(todo)
        "acoustic_token_dir": "acoutic_tokens",
        "use_text": false,
        "use_phone": true,
        "use_acoustic_token": true,
        "symbols_dict": "symbols.dict",
        "min_duration": 0.5, // the duration lowerbound to filter the audio with duration < min_duration
        "max_duration": 14, //  the duration uperbound to filter the audio with duration > max_duration.              
        "sample_rate": 24000, 
        "codec_hop_size": 320
    },
    "model": {
        "text_token_num": 512,
        "audio_token_num": 1024,
        "decoder_dim": 1024, // embedding dimension of the decoder model
        "nhead": 16, // number of attention heads in the decoder layers
        "num_decoder_layers": 12, // number of decoder layers
        "norm_first": true, // pre or post Normalization.
        "add_prenet": false, // whether add PreNet after Inputs
        "prefix_mode": 0, //  mode for how to prefix VALL-E NAR Decoder, 0: no prefix, 1: 0 to random, 2: random to random, 4: chunk of pre or post utterance
        "share_embedding": true, // share the parameters of the output projection layer with the parameters of the acoustic embedding
        "nar_scale_factor": 1, // model scale factor which will be assigned different meanings in different models
        "prepend_bos": false, // whether prepend <BOS> to the acoustic tokens -> AR Decoder inputs
        "num_quantizers": 8, // numbert of the audio quantization layers
        // "scaling_xformers": false, // Apply Reworked Conformer scaling on Transformers 
    },
    "train": {
        "use_dynamic_batchsize": false, // If use dynamic batch size
        "ddp": false,
        "train_stage": 1, // 0: train all modules, For VALL_E, support 1: AR Decoder 2: NAR Decoder(s)
        "max_epoch": 20, 
        "optimizer": "AdamW", 
        "scheduler": "cosine",
        "warmup_steps": 16000, // number of steps that affects how rapidly the learning rate decreases
        "total_training_steps": 800000,
        "base_lr": 1e-4, // base learning rate."
        "valid_interval": 1000,
        "log_epoch_step": 1000,
        "save_checkpoint_stride": [
            1,
            1
        ]
    }
}