Spaces: Running on Zero
{
  "base_config": "config/ns2.json",
  "model_type": "NaturalSpeech2",
  "dataset": [
    "libritts"
  ],
  "preprocess": {
    "use_mel": false,
    "use_code": true,
    "use_spkid": true,
    "use_pitch": true,
    "use_duration": true,
    "use_phone": true,
    "use_len": true,
    "use_cross_reference": true,
    "train_file": "train.json",
    "valid_file": "test.json",
    "melspec_dir": "mel",
    "code_dir": "code",
    "pitch_dir": "pitch",
    "duration_dir": "duration",
    "metadata_dir": "metadata",
    "read_metadata": true,
    "clip_mode": "start"
  },
  "model": {
    "latent_dim": 128,
    "prior_encoder": {
      "vocab_size": 100,
      "pitch_min": 50,
      "pitch_max": 1100,
      "pitch_bins_num": 512,
      "encoder": {
        "encoder_layer": 6,
        "encoder_hidden": 512,
        "encoder_head": 8,
        "conv_filter_size": 2048,
        "conv_kernel_size": 9,
        "encoder_dropout": 0.2,
        "use_cln": true
      },
      "duration_predictor": {
        "input_size": 512,
        "filter_size": 512,
        "kernel_size": 3,
        "conv_layers": 30,
        "cross_attn_per_layer": 3,
        "attn_head": 8,
        "drop_out": 0.5
      },
      "pitch_predictor": {
        "input_size": 512,
        "filter_size": 512,
        "kernel_size": 5,
        "conv_layers": 30,
        "cross_attn_per_layer": 3,
        "attn_head": 8,
        "drop_out": 0.5
      }
    },
    "diffusion": {
      "wavenet": {
        "input_size": 128,
        "hidden_size": 512,
        "out_size": 128,
        "num_layers": 40,
        "cross_attn_per_layer": 3,
        "dilation_cycle": 2,
        "attn_head": 8,
        "drop_out": 0.2
      },
      "beta_min": 0.05,
      "beta_max": 20,
      "sigma": 1.0,
      "noise_factor": 1.0,
      "ode_solver": "euler",
      "diffusion_type": "diffusion"
    },
    "prompt_encoder": {
      "encoder_layer": 6,
      "encoder_hidden": 512,
      "encoder_head": 8,
      "conv_filter_size": 2048,
      "conv_kernel_size": 9,
      "encoder_dropout": 0.2,
      "use_cln": false
    },
    "query_emb": {
      "query_token_num": 32,
      "hidden_size": 512,
      "head_num": 8
    },
    "inference_step": 500
  },
  "train": {
    "use_dynamic_batchsize": true,
    "max_tokens": 7500,
    "max_sentences": 32,
    "lr_warmup_steps": 5000,
    "lr_scheduler": "cosine",
    "num_train_steps": 800000,
    "adam": {
      "lr": 7.5e-5
    },
    "diff_ce_loss_lambda": 0.5,
    "diff_noise_loss_lambda": 1.0,
    "ddp": false,
    "random_seed": 114,
    "batch_size": 32,
    "epochs": 5000,
    "max_steps": 1000000,
    "total_training_steps": 800000,
    "save_summary_steps": 500,
    "save_checkpoints_steps": 2000,
    "valid_interval": 2000,
    "keep_checkpoint_max": 100
  }
}