log_dir: ""
save_freq: 1
log_interval: 10
save_interval: 1000
device: "cuda"
epochs: 1000  # number of epochs for first stage training (pre-training)
batch_size: 2
batch_length: 100  # maximum duration of audio in a batch (in seconds)
max_len: 80  # maximum number of frames
pretrained_model: ""
pretrained_encoder: ""
load_only_params: false  # set to true if do not want to load epoch numbers and optimizer parameters
preprocess_params:
  sr: 22050
  spect_params:
    n_fft: 1024
    win_length: 1024
    hop_length: 256
    n_mels: 80
model_params:
  dit_type: "DiT"  # uDiT or DiT
  reg_loss_type: "l1"  # l1 or l2
  speech_tokenizer:
    type: 'facodec'
    path: "speech_tokenizer_v1.onnx"
  cosyvoice:
    path: "../CosyVoice/pretrained_models/CosyVoice-300M"
  style_encoder:
    dim: 192
    campplus_path: "campplus_cn_common.bin"
  DAC:
    encoder_dim: 64
    encoder_rates: [2, 5, 5, 6]
    decoder_dim: 1536
    decoder_rates: [6, 5, 5, 2]
    sr: 24000
  length_regulator:
    channels: 512
    is_discrete: true
    content_codebook_size: 1024
    in_frame_rate: 80
    out_frame_rate: 80
    sampling_ratios: [1, 1, 1, 1]
    token_dropout_prob: 0.3  # probability of performing token dropout
    token_dropout_range: 1.0  # maximum percentage of tokens to drop out
    n_codebooks: 1
    quantizer_dropout: 0.5
    f0_condition: true
    n_f0_bins: 512
  DiT:
    hidden_dim: 512
    num_heads: 8
    depth: 13
    class_dropout_prob: 0.1
    block_size: 8192
    in_channels: 80
    style_condition: true
    final_layer_type: 'wavenet'
    target: 'mel'  # mel or codec
    content_dim: 512
    content_codebook_size: 1024
    content_type: 'discrete'
    f0_condition: true
    n_f0_bins: 512
    content_codebooks: 3
    is_causal: false
    long_skip_connection: true
    zero_prompt_speech_token: false  # for prompt component, do not input corresponding speech token
    time_as_token: false
    style_as_token: false
    uvit_skip_connection: true
    add_resblock_in_transformer: false
  wavenet:
    hidden_dim: 512
    num_layers: 8
    kernel_size: 5
    dilation_rate: 1
    p_dropout: 0.2
    style_condition: true
loss_params:
  base_lr: 0.0001
  lambda_mel: 45
  lambda_kl: 1.0