Spaces:
Running
Running
{ | |
"base_config": "config/tts.json", | |
"model_type": "VITS", | |
"task_type": "tts", | |
"preprocess": { | |
"extract_phone": true, | |
"extract_mel": true, | |
"n_mel": 80, | |
"fmin": 0, | |
"fmax": null, | |
"extract_linear_spec": true, | |
"extract_audio": true, | |
"use_linear": true, | |
"use_mel": true, | |
"use_audio": true, | |
"use_text": false, | |
"use_phone": true, | |
"lexicon_path": "./text/lexicon/librispeech-lexicon.txt", | |
"n_fft": 1024, | |
"win_size": 1024, | |
"hop_size": 256, | |
"segment_size": 8192, | |
"text_cleaners": [ | |
"english_cleaners" | |
] | |
}, | |
"model": { | |
"text_token_num": 512, | |
"inter_channels": 192, | |
"hidden_channels": 192, | |
"filter_channels": 768, | |
"n_heads": 2, | |
"n_layers": 6, | |
"kernel_size": 3, | |
"p_dropout": 0.1, | |
"resblock": "1", | |
"resblock_kernel_sizes": [ | |
3, | |
7, | |
11 | |
], | |
"resblock_dilation_sizes": [ | |
[ | |
1, | |
3, | |
5 | |
], | |
[ | |
1, | |
3, | |
5 | |
], | |
[ | |
1, | |
3, | |
5 | |
] | |
], | |
"upsample_rates": [ | |
8, | |
8, | |
2, | |
2 | |
], | |
"upsample_initial_channel": 512, | |
"upsample_kernel_sizes": [ | |
16, | |
16, | |
4, | |
4 | |
], | |
"n_layers_q": 3, | |
"use_spectral_norm": false, | |
"n_speakers": 0, // number of speakers, while be automatically set if n_speakers is 0 and multi_speaker_training is true | |
"gin_channels": 256, | |
"use_sdp": true | |
}, | |
"train": { | |
"fp16_run": true, | |
"learning_rate": 2e-4, | |
"betas": [ | |
0.8, | |
0.99 | |
], | |
"eps": 1e-9, | |
"batch_size": 16, | |
"lr_decay": 0.999875, | |
// "segment_size": 8192, | |
"init_lr_ratio": 1, | |
"warmup_epochs": 0, | |
"c_mel": 45, | |
"c_kl": 1.0, | |
"AdamW": { | |
"betas": [ | |
0.8, | |
0.99 | |
], | |
"eps": 1e-9, | |
} | |
} | |
} |