litagin committed on
Commit
af7b5a0
1 Parent(s): cea5d7d

Upload 2 files

Browse files
Files changed (2) hide show
  1. config.yaml +232 -0
  2. pretrained.pth +3 -0
config.yaml ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ token_list:
2
+ - <blank>
3
+ - <unk>
4
+ - a
5
+ - o
6
+ - i
7
+ - '['
8
+ - '#'
9
+ - u
10
+ - ']'
11
+ - e
12
+ - k
13
+ - n
14
+ - t
15
+ - r
16
+ - s
17
+ - N
18
+ - m
19
+ - _
20
+ - sh
21
+ - d
22
+ - g
23
+ - ^
24
+ - $
25
+ - w
26
+ - cl
27
+ - h
28
+ - y
29
+ - b
30
+ - j
31
+ - ts
32
+ - ch
33
+ - z
34
+ - p
35
+ - f
36
+ - ky
37
+ - ry
38
+ - gy
39
+ - hy
40
+ - ny
41
+ - by
42
+ - my
43
+ - py
44
+ - v
45
+ - dy
46
+ - '?'
47
+ - ty
48
+ - <sos/eos>
49
+ odim: null
50
+ model_conf: {}
51
+ use_preprocessor: true
52
+ token_type: phn
53
+ bpemodel: null
54
+ non_linguistic_symbols: null
55
+ cleaner: jaconv
56
+ g2p: pyopenjtalk_prosody
57
+ feats_extract: linear_spectrogram
58
+ feats_extract_conf:
59
+ n_fft: 2048
60
+ hop_length: 512
61
+ win_length: null
62
+ normalize: null
63
+ normalize_conf: {}
64
+ tts: vits
65
+ tts_conf:
66
+ generator_type: vits_generator
67
+ generator_params:
68
+ hidden_channels: 192
69
+ spks: -1
70
+ global_channels: -1
71
+ segment_size: 32
72
+ text_encoder_attention_heads: 2
73
+ text_encoder_ffn_expand: 4
74
+ text_encoder_blocks: 6
75
+ text_encoder_positionwise_layer_type: conv1d
76
+ text_encoder_positionwise_conv_kernel_size: 3
77
+ text_encoder_positional_encoding_layer_type: rel_pos
78
+ text_encoder_self_attention_layer_type: rel_selfattn
79
+ text_encoder_activation_type: swish
80
+ text_encoder_normalize_before: true
81
+ text_encoder_dropout_rate: 0.1
82
+ text_encoder_positional_dropout_rate: 0.0
83
+ text_encoder_attention_dropout_rate: 0.1
84
+ use_macaron_style_in_text_encoder: true
85
+ use_conformer_conv_in_text_encoder: false
86
+ text_encoder_conformer_kernel_size: -1
87
+ decoder_kernel_size: 7
88
+ decoder_channels: 512
89
+ decoder_upsample_scales:
90
+ - 8
91
+ - 8
92
+ - 2
93
+ - 2
94
+ - 2
95
+ decoder_upsample_kernel_sizes:
96
+ - 16
97
+ - 16
98
+ - 4
99
+ - 4
100
+ - 4
101
+ decoder_resblock_kernel_sizes:
102
+ - 3
103
+ - 7
104
+ - 11
105
+ decoder_resblock_dilations:
106
+ - - 1
107
+ - 3
108
+ - 5
109
+ - - 1
110
+ - 3
111
+ - 5
112
+ - - 1
113
+ - 3
114
+ - 5
115
+ use_weight_norm_in_decoder: true
116
+ posterior_encoder_kernel_size: 5
117
+ posterior_encoder_layers: 16
118
+ posterior_encoder_stacks: 1
119
+ posterior_encoder_base_dilation: 1
120
+ posterior_encoder_dropout_rate: 0.0
121
+ use_weight_norm_in_posterior_encoder: true
122
+ flow_flows: 4
123
+ flow_kernel_size: 5
124
+ flow_base_dilation: 1
125
+ flow_layers: 4
126
+ flow_dropout_rate: 0.0
127
+ use_weight_norm_in_flow: true
128
+ use_only_mean_in_flow: true
129
+ stochastic_duration_predictor_kernel_size: 3
130
+ stochastic_duration_predictor_dropout_rate: 0.5
131
+ stochastic_duration_predictor_flows: 4
132
+ stochastic_duration_predictor_dds_conv_layers: 3
133
+ vocabs: 47
134
+ aux_channels: 1025
135
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
136
+ discriminator_params:
137
+ scales: 1
138
+ scale_downsample_pooling: AvgPool1d
139
+ scale_downsample_pooling_params:
140
+ kernel_size: 4
141
+ stride: 2
142
+ padding: 2
143
+ scale_discriminator_params:
144
+ in_channels: 1
145
+ out_channels: 1
146
+ kernel_sizes:
147
+ - 15
148
+ - 41
149
+ - 5
150
+ - 3
151
+ channels: 128
152
+ max_downsample_channels: 1024
153
+ max_groups: 16
154
+ bias: true
155
+ downsample_scales:
156
+ - 2
157
+ - 2
158
+ - 4
159
+ - 4
160
+ - 1
161
+ nonlinear_activation: LeakyReLU
162
+ nonlinear_activation_params:
163
+ negative_slope: 0.1
164
+ use_weight_norm: true
165
+ use_spectral_norm: false
166
+ follow_official_norm: false
167
+ periods:
168
+ - 2
169
+ - 3
170
+ - 5
171
+ - 7
172
+ - 11
173
+ period_discriminator_params:
174
+ in_channels: 1
175
+ out_channels: 1
176
+ kernel_sizes:
177
+ - 5
178
+ - 3
179
+ channels: 32
180
+ downsample_scales:
181
+ - 3
182
+ - 3
183
+ - 3
184
+ - 3
185
+ - 1
186
+ max_downsample_channels: 1024
187
+ bias: true
188
+ nonlinear_activation: LeakyReLU
189
+ nonlinear_activation_params:
190
+ negative_slope: 0.1
191
+ use_weight_norm: true
192
+ use_spectral_norm: false
193
+ generator_adv_loss_params:
194
+ average_by_discriminators: false
195
+ loss_type: mse
196
+ discriminator_adv_loss_params:
197
+ average_by_discriminators: false
198
+ loss_type: mse
199
+ feat_match_loss_params:
200
+ average_by_discriminators: false
201
+ average_by_layers: false
202
+ include_final_outputs: true
203
+ mel_loss_params:
204
+ fs: 44100
205
+ n_fft: 2048
206
+ hop_length: 512
207
+ win_length: null
208
+ window: hann
209
+ n_mels: 80
210
+ fmin: 0
211
+ fmax: null
212
+ log_base: null
213
+ lambda_adv: 1.0
214
+ lambda_mel: 45.0
215
+ lambda_feat_match: 2.0
216
+ lambda_dur: 1.0
217
+ lambda_kl: 1.0
218
+ sampling_rate: 44100
219
+ cache_generator_outputs: true
220
+ pitch_extract: null
221
+ pitch_extract_conf: {}
222
+ pitch_normalize: null
223
+ pitch_normalize_conf: {}
224
+ energy_extract: null
225
+ energy_extract_conf: {}
226
+ energy_normalize: null
227
+ energy_normalize_conf: {}
228
+ required:
229
+ - output_dir
230
+ - token_list
231
+ version: '202308'
232
+ distributed: false
pretrained.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:604d4f4cc757cc07c2402080ae0bdf80b473be30d278d32233514f6e28f685ac
3
+ size 373149238