{
  "base_config": "config/base.json",
  "model_type": "NaturalSpeech2",
  "dataset": ["libritts"],
  "preprocess": {
    "use_mel": false,
    "use_code": true,
    "use_spkid": true,
    "use_pitch": true,
    "use_duration": true,
    "use_phone": true,
    "use_len": true,
    "use_cross_reference": true,
    "train_file": "train.json",
    "melspec_dir": "mel",
    "code_dir": "code",
    "pitch_dir": "pitch",
    "duration_dir": "duration",
    "clip_mode": "start"
  },
  "model": {
    "latent_dim": 128,
    "prior_encoder": {
      "vocab_size": 100,
      "pitch_min": 50,
      "pitch_max": 1100,
      "pitch_bins_num": 512,
      "encoder": {
        "encoder_layer": 6,
        "encoder_hidden": 512,
        "encoder_head": 8,
        "conv_filter_size": 2048,
        "conv_kernel_size": 9,
        "encoder_dropout": 0.2,
        "use_cln": true
      },
      "duration_predictor": {
        "input_size": 512,
        "filter_size": 512,
        "kernel_size": 3,
        "conv_layers": 30,
        "cross_attn_per_layer": 3,
        "attn_head": 8,
        "drop_out": 0.5
      },
      "pitch_predictor": {
        "input_size": 512,
        "filter_size": 512,
        "kernel_size": 5,
        "conv_layers": 30,
        "cross_attn_per_layer": 3,
        "attn_head": 8,
        "drop_out": 0.5
      }
    },
    "diffusion": {
      "wavenet": {
        "input_size": 128,
        "hidden_size": 512,
        "out_size": 128,
        "num_layers": 40,
        "cross_attn_per_layer": 3,
        "dilation_cycle": 2,
        "attn_head": 8,
        "drop_out": 0.2
      },
      "beta_min": 0.05,
      "beta_max": 20,
      "sigma": 1.0,
      "noise_factor": 1.0,
      "ode_solver": "euler"
    },
    "prompt_encoder": {
      "encoder_layer": 6,
      "encoder_hidden": 512,
      "encoder_head": 8,
      "conv_filter_size": 2048,
      "conv_kernel_size": 9,
      "encoder_dropout": 0.2,
      "use_cln": false
    },
    "query_emb": {
      "query_token_num": 32,
      "hidden_size": 512,
      "head_num": 8
    }
  }
}