token_list: - - - a - o - i - '[' - '#' - u - ']' - e - k - n - t - r - s - N - m - _ - sh - d - g - ^ - $ - w - cl - h - y - b - j - ts - ch - z - p - f - ky - ry - gy - hy - ny - by - my - py - v - dy - '?' - ty - odim: null model_conf: {} use_preprocessor: true token_type: phn bpemodel: null non_linguistic_symbols: null cleaner: jaconv g2p: pyopenjtalk_prosody feats_extract: linear_spectrogram feats_extract_conf: n_fft: 2048 hop_length: 512 win_length: null normalize: null normalize_conf: {} tts: vits tts_conf: generator_type: vits_generator generator_params: hidden_channels: 192 spks: -1 global_channels: -1 segment_size: 32 text_encoder_attention_heads: 2 text_encoder_ffn_expand: 4 text_encoder_blocks: 6 text_encoder_positionwise_layer_type: conv1d text_encoder_positionwise_conv_kernel_size: 3 text_encoder_positional_encoding_layer_type: rel_pos text_encoder_self_attention_layer_type: rel_selfattn text_encoder_activation_type: swish text_encoder_normalize_before: true text_encoder_dropout_rate: 0.1 text_encoder_positional_dropout_rate: 0.0 text_encoder_attention_dropout_rate: 0.1 use_macaron_style_in_text_encoder: true use_conformer_conv_in_text_encoder: false text_encoder_conformer_kernel_size: -1 decoder_kernel_size: 7 decoder_channels: 512 decoder_upsample_scales: - 8 - 8 - 2 - 2 - 2 decoder_upsample_kernel_sizes: - 16 - 16 - 4 - 4 - 4 decoder_resblock_kernel_sizes: - 3 - 7 - 11 decoder_resblock_dilations: - - 1 - 3 - 5 - - 1 - 3 - 5 - - 1 - 3 - 5 use_weight_norm_in_decoder: true posterior_encoder_kernel_size: 5 posterior_encoder_layers: 16 posterior_encoder_stacks: 1 posterior_encoder_base_dilation: 1 posterior_encoder_dropout_rate: 0.0 use_weight_norm_in_posterior_encoder: true flow_flows: 4 flow_kernel_size: 5 flow_base_dilation: 1 flow_layers: 4 flow_dropout_rate: 0.0 use_weight_norm_in_flow: true use_only_mean_in_flow: true stochastic_duration_predictor_kernel_size: 3 stochastic_duration_predictor_dropout_rate: 0.5 stochastic_duration_predictor_flows: 4 stochastic_duration_predictor_dds_conv_layers: 3 vocabs: 47 aux_channels: 1025 discriminator_type: hifigan_multi_scale_multi_period_discriminator discriminator_params: scales: 1 scale_downsample_pooling: AvgPool1d scale_downsample_pooling_params: kernel_size: 4 stride: 2 padding: 2 scale_discriminator_params: in_channels: 1 out_channels: 1 kernel_sizes: - 15 - 41 - 5 - 3 channels: 128 max_downsample_channels: 1024 max_groups: 16 bias: true downsample_scales: - 2 - 2 - 4 - 4 - 1 nonlinear_activation: LeakyReLU nonlinear_activation_params: negative_slope: 0.1 use_weight_norm: true use_spectral_norm: false follow_official_norm: false periods: - 2 - 3 - 5 - 7 - 11 period_discriminator_params: in_channels: 1 out_channels: 1 kernel_sizes: - 5 - 3 channels: 32 downsample_scales: - 3 - 3 - 3 - 3 - 1 max_downsample_channels: 1024 bias: true nonlinear_activation: LeakyReLU nonlinear_activation_params: negative_slope: 0.1 use_weight_norm: true use_spectral_norm: false generator_adv_loss_params: average_by_discriminators: false loss_type: mse discriminator_adv_loss_params: average_by_discriminators: false loss_type: mse feat_match_loss_params: average_by_discriminators: false average_by_layers: false include_final_outputs: true mel_loss_params: fs: 44100 n_fft: 2048 hop_length: 512 win_length: null window: hann n_mels: 80 fmin: 0 fmax: null log_base: null lambda_adv: 1.0 lambda_mel: 45.0 lambda_feat_match: 2.0 lambda_dur: 1.0 lambda_kl: 1.0 sampling_rate: 44100 cache_generator_outputs: true pitch_extract: null pitch_extract_conf: {} pitch_normalize: null pitch_normalize_conf: {} energy_extract: null energy_extract_conf: {} energy_normalize: null energy_normalize_conf: {} required: - output_dir - token_list version: '202308' distributed: false