{
  "base_config": "config/base.json",
  "model_type": "NaturalSpeech2",
  "dataset": ["libritts"],
  "preprocess": {
    "use_mel": false,
    "use_code": true,
    "use_spkid": true,
    "use_pitch": true,
    "use_duration": true,
    "use_phone": true,
    "use_len": true,
    "use_cross_reference": true,
    "train_file": "train.json",
    "melspec_dir": "mel",
    "code_dir": "code",
    "pitch_dir": "pitch",
    "duration_dir": "duration",
    "clip_mode": "start"
  },
  "model": {
    "latent_dim": 128,
    "prior_encoder": {
      "vocab_size": 100,
      "pitch_min": 50,
      "pitch_max": 1100,
      "pitch_bins_num": 512,
      "encoder": {
        "encoder_layer": 6,
        "encoder_hidden": 512,
        "encoder_head": 8,
        "conv_filter_size": 2048,
        "conv_kernel_size": 9,
        "encoder_dropout": 0.2,
        "use_cln": true
      },
      "duration_predictor": {
        "input_size": 512,
        "filter_size": 512,
        "kernel_size": 3,
        "conv_layers": 30,
        "cross_attn_per_layer": 3,
        "attn_head": 8,
        "drop_out": 0.5
      },
      "pitch_predictor": {
        "input_size": 512,
        "filter_size": 512,
        "kernel_size": 5,
        "conv_layers": 30,
        "cross_attn_per_layer": 3,
        "attn_head": 8,
        "drop_out": 0.5
      }
    },
    "diffusion": {
      "wavenet": {
        "input_size": 128,
        "hidden_size": 512,
        "out_size": 128,
        "num_layers": 40,
        "cross_attn_per_layer": 3,
        "dilation_cycle": 2,
        "attn_head": 8,
        "drop_out": 0.2
      },
      "beta_min": 0.05,
      "beta_max": 20,
      "sigma": 1.0,
      "noise_factor": 1.0,
      "ode_solver": "euler"
    },
    "prompt_encoder": {
      "encoder_layer": 6,
      "encoder_hidden": 512,
      "encoder_head": 8,
      "conv_filter_size": 2048,
      "conv_kernel_size": 9,
      "encoder_dropout": 0.2,
      "use_cln": false
    },
    "query_emb": {
      "query_token_num": 32,
      "hidden_size": 512,
      "head_num": 8
    }
  }
}