Monsia committed on
Commit
63ee588
1 Parent(s): 223ba00

First model version

Browse files
Files changed (27) hide show
  1. README.md +412 -1
  2. exp/tts_stats_raw_linear_spectrogram_char_tacotron/train/feats_stats.npz +0 -0
  3. exp/tts_train_vits_raw_char_tacotron/20epoch.pth +3 -0
  4. exp/tts_train_vits_raw_char_tacotron/config.yaml +336 -0
  5. exp/tts_train_vits_raw_char_tacotron/images/discriminator_backward_time.png +0 -0
  6. exp/tts_train_vits_raw_char_tacotron/images/discriminator_fake_loss.png +0 -0
  7. exp/tts_train_vits_raw_char_tacotron/images/discriminator_forward_time.png +0 -0
  8. exp/tts_train_vits_raw_char_tacotron/images/discriminator_loss.png +0 -0
  9. exp/tts_train_vits_raw_char_tacotron/images/discriminator_optim_step_time.png +0 -0
  10. exp/tts_train_vits_raw_char_tacotron/images/discriminator_real_loss.png +0 -0
  11. exp/tts_train_vits_raw_char_tacotron/images/discriminator_train_time.png +0 -0
  12. exp/tts_train_vits_raw_char_tacotron/images/generator_adv_loss.png +0 -0
  13. exp/tts_train_vits_raw_char_tacotron/images/generator_backward_time.png +0 -0
  14. exp/tts_train_vits_raw_char_tacotron/images/generator_dur_loss.png +0 -0
  15. exp/tts_train_vits_raw_char_tacotron/images/generator_feat_match_loss.png +0 -0
  16. exp/tts_train_vits_raw_char_tacotron/images/generator_forward_time.png +0 -0
  17. exp/tts_train_vits_raw_char_tacotron/images/generator_kl_loss.png +0 -0
  18. exp/tts_train_vits_raw_char_tacotron/images/generator_loss.png +0 -0
  19. exp/tts_train_vits_raw_char_tacotron/images/generator_mel_loss.png +0 -0
  20. exp/tts_train_vits_raw_char_tacotron/images/generator_optim_step_time.png +0 -0
  21. exp/tts_train_vits_raw_char_tacotron/images/generator_train_time.png +0 -0
  22. exp/tts_train_vits_raw_char_tacotron/images/gpu_max_cached_mem_GB.png +0 -0
  23. exp/tts_train_vits_raw_char_tacotron/images/iter_time.png +0 -0
  24. exp/tts_train_vits_raw_char_tacotron/images/optim0_lr0.png +0 -0
  25. exp/tts_train_vits_raw_char_tacotron/images/optim1_lr0.png +0 -0
  26. exp/tts_train_vits_raw_char_tacotron/images/train_time.png +0 -0
  27. meta.yaml +8 -0
README.md CHANGED
@@ -1,3 +1,414 @@
1
  ---
2
- license: apache-2.0
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language:
7
+ datasets:
8
+ -
9
+ license: cc-by-4.0
10
  ---
11
+
12
+ ## ESPnet2 TTS model
13
+
14
+ ### ``
15
+
16
+ This model was trained using a recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ ```bash
21
+ cd espnet
22
+
23
+ pip install -e .
24
+ cd egs2/afrilang-bci/tts1
25
+ ./run.sh --skip_data_prep false --skip_train true --download_model
26
+ ```
27
+
28
+
29
+
30
+ ## TTS config
31
+
32
+ <details><summary>expand</summary>
33
+
34
+ ```
35
+ config: ./conf/train_vits.yaml
36
+ print_config: false
37
+ log_level: INFO
38
+ dry_run: false
39
+ iterator_type: sequence
40
+ output_dir: exp/44k/tts_train_vits_raw_char_tacotron
41
+ ngpu: 1
42
+ seed: 777
43
+ num_workers: 4
44
+ num_att_plot: 3
45
+ dist_backend: nccl
46
+ dist_init_method: env://
47
+ dist_world_size: null
48
+ dist_rank: null
49
+ local_rank: 0
50
+ dist_master_addr: null
51
+ dist_master_port: null
52
+ dist_launcher: null
53
+ multiprocessing_distributed: false
54
+ unused_parameters: true
55
+ sharded_ddp: false
56
+ cudnn_enabled: true
57
+ cudnn_benchmark: false
58
+ cudnn_deterministic: false
59
+ collect_stats: false
60
+ write_collected_feats: false
61
+ max_epoch: 20
62
+ patience: null
63
+ val_scheduler_criterion:
64
+ - valid
65
+ - loss
66
+ early_stopping_criterion:
67
+ - valid
68
+ - loss
69
+ - min
70
+ best_model_criterion:
71
+ - - train
72
+ - total_count
73
+ - max
74
+ keep_nbest_models: 2
75
+ nbest_averaging_interval: 0
76
+ grad_clip: -1
77
+ grad_clip_type: 2.0
78
+ grad_noise: false
79
+ accum_grad: 1
80
+ no_forward_run: false
81
+ resume: true
82
+ train_dtype: float32
83
+ use_amp: false
84
+ log_interval: 5
85
+ use_matplotlib: true
86
+ use_tensorboard: true
87
+ use_wandb: false
88
+ wandb_project: null
89
+ wandb_id: null
90
+ wandb_entity: null
91
+ wandb_name: null
92
+ wandb_model_log_interval: -1
93
+ detect_anomaly: false
94
+ pretrain_path: null
95
+ init_param: []
96
+ ignore_init_mismatch: false
97
+ freeze_param: []
98
+ num_iters_per_epoch: 20
99
+ batch_size: 20
100
+ valid_batch_size: null
101
+ batch_bins: 500
102
+ valid_batch_bins: null
103
+ train_shape_file:
104
+ - exp/44k/tts_stats_raw_linear_spectrogram_char_tacotron/train/text_shape.char
105
+ - exp/44k/tts_stats_raw_linear_spectrogram_char_tacotron/train/speech_shape
106
+ valid_shape_file:
107
+ - exp/44k/tts_stats_raw_linear_spectrogram_char_tacotron/valid/text_shape.char
108
+ - exp/44k/tts_stats_raw_linear_spectrogram_char_tacotron/valid/speech_shape
109
+ batch_type: numel
110
+ valid_batch_type: null
111
+ fold_length:
112
+ - 150
113
+ - 204800
114
+ sort_in_batch: descending
115
+ sort_batch: descending
116
+ multiple_iterator: false
117
+ chunk_length: 500
118
+ chunk_shift_ratio: 0.5
119
+ num_cache_chunks: 1024
120
+ train_data_path_and_name_and_type:
121
+ - - dump/raw/org/train/text
122
+ - text
123
+ - text
124
+ - - dump/raw/org/train/wav.scp
125
+ - speech
126
+ - sound
127
+ valid_data_path_and_name_and_type:
128
+ - - dump/raw/org/test/text
129
+ - text
130
+ - text
131
+ - - dump/raw/org/test/wav.scp
132
+ - speech
133
+ - sound
134
+ allow_variable_data_keys: false
135
+ max_cache_size: 0.0
136
+ max_cache_fd: 32
137
+ valid_max_cache_size: null
138
+ optim: adamw
139
+ optim_conf:
140
+ lr: 0.0002
141
+ betas:
142
+ - 0.8
143
+ - 0.99
144
+ eps: 1.0e-09
145
+ weight_decay: 0.0
146
+ scheduler: exponentiallr
147
+ scheduler_conf:
148
+ gamma: 0.999875
149
+ optim2: adamw
150
+ optim2_conf:
151
+ lr: 0.0002
152
+ betas:
153
+ - 0.8
154
+ - 0.99
155
+ eps: 1.0e-09
156
+ weight_decay: 0.0
157
+ scheduler2: exponentiallr
158
+ scheduler2_conf:
159
+ gamma: 0.999875
160
+ generator_first: false
161
+ token_list:
162
+ - <blank>
163
+ - <unk>
164
+ - <space>
165
+ - N
166
+ - E
167
+ - A
168
+ - I
169
+ - O
170
+ - U
171
+ - L
172
+ - K
173
+ - M
174
+ - S
175
+ - B
176
+ - W
177
+ - T
178
+ - F
179
+ - R
180
+ - Y
181
+ - Z
182
+ - D
183
+ - G
184
+ - J
185
+ - P
186
+ - C
187
+ - V
188
+ - <sos/eos>
189
+ odim: null
190
+ model_conf: {}
191
+ use_preprocessor: true
192
+ token_type: char
193
+ bpemodel: null
194
+ non_linguistic_symbols: null
195
+ cleaner: tacotron
196
+ g2p: g2p_en
197
+ feats_extract: linear_spectrogram
198
+ feats_extract_conf:
199
+ n_fft: 1024
200
+ hop_length: 256
201
+ win_length: null
202
+ normalize: null
203
+ normalize_conf: {}
204
+ tts: vits
205
+ tts_conf:
206
+ generator_type: vits_generator
207
+ generator_params:
208
+ hidden_channels: 192
209
+ spks: -1
210
+ global_channels: -1
211
+ segment_size: 32
212
+ text_encoder_attention_heads: 2
213
+ text_encoder_ffn_expand: 4
214
+ text_encoder_blocks: 6
215
+ text_encoder_positionwise_layer_type: conv1d
216
+ text_encoder_positionwise_conv_kernel_size: 3
217
+ text_encoder_positional_encoding_layer_type: rel_pos
218
+ text_encoder_self_attention_layer_type: rel_selfattn
219
+ text_encoder_activation_type: swish
220
+ text_encoder_normalize_before: true
221
+ text_encoder_dropout_rate: 0.1
222
+ text_encoder_positional_dropout_rate: 0.0
223
+ text_encoder_attention_dropout_rate: 0.1
224
+ use_macaron_style_in_text_encoder: true
225
+ use_conformer_conv_in_text_encoder: false
226
+ text_encoder_conformer_kernel_size: -1
227
+ decoder_kernel_size: 7
228
+ decoder_channels: 512
229
+ decoder_upsample_scales:
230
+ - 8
231
+ - 8
232
+ - 2
233
+ - 2
234
+ decoder_upsample_kernel_sizes:
235
+ - 16
236
+ - 16
237
+ - 4
238
+ - 4
239
+ decoder_resblock_kernel_sizes:
240
+ - 3
241
+ - 7
242
+ - 11
243
+ decoder_resblock_dilations:
244
+ - - 1
245
+ - 3
246
+ - 5
247
+ - - 1
248
+ - 3
249
+ - 5
250
+ - - 1
251
+ - 3
252
+ - 5
253
+ use_weight_norm_in_decoder: true
254
+ posterior_encoder_kernel_size: 5
255
+ posterior_encoder_layers: 16
256
+ posterior_encoder_stacks: 1
257
+ posterior_encoder_base_dilation: 1
258
+ posterior_encoder_dropout_rate: 0.0
259
+ use_weight_norm_in_posterior_encoder: true
260
+ flow_flows: 4
261
+ flow_kernel_size: 5
262
+ flow_base_dilation: 1
263
+ flow_layers: 4
264
+ flow_dropout_rate: 0.0
265
+ use_weight_norm_in_flow: true
266
+ use_only_mean_in_flow: true
267
+ stochastic_duration_predictor_kernel_size: 3
268
+ stochastic_duration_predictor_dropout_rate: 0.5
269
+ stochastic_duration_predictor_flows: 4
270
+ stochastic_duration_predictor_dds_conv_layers: 3
271
+ vocabs: 27
272
+ aux_channels: 513
273
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
274
+ discriminator_params:
275
+ scales: 1
276
+ scale_downsample_pooling: AvgPool1d
277
+ scale_downsample_pooling_params:
278
+ kernel_size: 4
279
+ stride: 2
280
+ padding: 2
281
+ scale_discriminator_params:
282
+ in_channels: 1
283
+ out_channels: 1
284
+ kernel_sizes:
285
+ - 15
286
+ - 41
287
+ - 5
288
+ - 3
289
+ channels: 128
290
+ max_downsample_channels: 1024
291
+ max_groups: 16
292
+ bias: true
293
+ downsample_scales:
294
+ - 2
295
+ - 2
296
+ - 4
297
+ - 4
298
+ - 1
299
+ nonlinear_activation: LeakyReLU
300
+ nonlinear_activation_params:
301
+ negative_slope: 0.1
302
+ use_weight_norm: true
303
+ use_spectral_norm: false
304
+ follow_official_norm: false
305
+ periods:
306
+ - 2
307
+ - 3
308
+ - 5
309
+ - 7
310
+ - 11
311
+ period_discriminator_params:
312
+ in_channels: 1
313
+ out_channels: 1
314
+ kernel_sizes:
315
+ - 5
316
+ - 3
317
+ channels: 32
318
+ downsample_scales:
319
+ - 3
320
+ - 3
321
+ - 3
322
+ - 3
323
+ - 1
324
+ max_downsample_channels: 1024
325
+ bias: true
326
+ nonlinear_activation: LeakyReLU
327
+ nonlinear_activation_params:
328
+ negative_slope: 0.1
329
+ use_weight_norm: true
330
+ use_spectral_norm: false
331
+ generator_adv_loss_params:
332
+ average_by_discriminators: false
333
+ loss_type: mse
334
+ discriminator_adv_loss_params:
335
+ average_by_discriminators: false
336
+ loss_type: mse
337
+ feat_match_loss_params:
338
+ average_by_discriminators: false
339
+ average_by_layers: false
340
+ include_final_outputs: true
341
+ mel_loss_params:
342
+ fs: 44100
343
+ n_fft: 1024
344
+ hop_length: 256
345
+ win_length: null
346
+ window: hann
347
+ n_mels: 80
348
+ fmin: 0
349
+ fmax: null
350
+ log_base: null
351
+ lambda_adv: 1.0
352
+ lambda_mel: 45.0
353
+ lambda_feat_match: 2.0
354
+ lambda_dur: 1.0
355
+ lambda_kl: 1.0
356
+ sampling_rate: 44100
357
+ cache_generator_outputs: true
358
+ pitch_extract: null
359
+ pitch_extract_conf: {}
360
+ pitch_normalize: null
361
+ pitch_normalize_conf: {}
362
+ energy_extract: null
363
+ energy_extract_conf: {}
364
+ energy_normalize: null
365
+ energy_normalize_conf: {}
366
+ required:
367
+ - output_dir
368
+ - token_list
369
+ version: '202204'
370
+ distributed: false
371
+ ```
372
+
373
+ </details>
374
+
375
+
376
+
377
+ ### Citing ESPnet
378
+
379
+ ```bibtex
380
+ @inproceedings{watanabe2018espnet,
381
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
382
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
383
+ year={2018},
384
+ booktitle={Proceedings of Interspeech},
385
+ pages={2207--2211},
386
+ doi={10.21437/Interspeech.2018-1456},
387
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
388
+ }
389
+
390
+
391
+
392
+
393
+ @inproceedings{hayashi2020espnet,
394
+ title={{Espnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
395
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
396
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
397
+ pages={7654--7658},
398
+ year={2020},
399
+ organization={IEEE}
400
+ }
401
+ ```
402
+
403
+ or arXiv:
404
+
405
+ ```bibtex
406
+ @misc{watanabe2018espnet,
407
+ title={ESPnet: End-to-End Speech Processing Toolkit},
408
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
409
+ year={2018},
410
+ eprint={1804.00015},
411
+ archivePrefix={arXiv},
412
+ primaryClass={cs.CL}
413
+ }
414
+ ```
exp/tts_stats_raw_linear_spectrogram_char_tacotron/train/feats_stats.npz ADDED
Binary file (4.87 kB). View file
 
exp/tts_train_vits_raw_char_tacotron/20epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0918ea7e8df4170f51ff10dd0b8b6b9eae576fa288d63690c187cfa00389401
3
+ size 372510721
exp/tts_train_vits_raw_char_tacotron/config.yaml ADDED
@@ -0,0 +1,336 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: ./conf/train_vits.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/44k/tts_train_vits_raw_char_tacotron
7
+ ngpu: 1
8
+ seed: 777
9
+ num_workers: 4
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: true
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: false
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 20
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - train
38
+ - total_count
39
+ - max
40
+ keep_nbest_models: 2
41
+ nbest_averaging_interval: 0
42
+ grad_clip: -1
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 1
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: false
50
+ log_interval: 5
51
+ use_matplotlib: true
52
+ use_tensorboard: true
53
+ use_wandb: false
54
+ wandb_project: null
55
+ wandb_id: null
56
+ wandb_entity: null
57
+ wandb_name: null
58
+ wandb_model_log_interval: -1
59
+ detect_anomaly: false
60
+ pretrain_path: null
61
+ init_param: []
62
+ ignore_init_mismatch: false
63
+ freeze_param: []
64
+ num_iters_per_epoch: 20
65
+ batch_size: 20
66
+ valid_batch_size: null
67
+ batch_bins: 500
68
+ valid_batch_bins: null
69
+ train_shape_file:
70
+ - exp/44k/tts_stats_raw_linear_spectrogram_char_tacotron/train/text_shape.char
71
+ - exp/44k/tts_stats_raw_linear_spectrogram_char_tacotron/train/speech_shape
72
+ valid_shape_file:
73
+ - exp/44k/tts_stats_raw_linear_spectrogram_char_tacotron/valid/text_shape.char
74
+ - exp/44k/tts_stats_raw_linear_spectrogram_char_tacotron/valid/speech_shape
75
+ batch_type: numel
76
+ valid_batch_type: null
77
+ fold_length:
78
+ - 150
79
+ - 204800
80
+ sort_in_batch: descending
81
+ sort_batch: descending
82
+ multiple_iterator: false
83
+ chunk_length: 500
84
+ chunk_shift_ratio: 0.5
85
+ num_cache_chunks: 1024
86
+ train_data_path_and_name_and_type:
87
+ - - dump/raw/org/train/text
88
+ - text
89
+ - text
90
+ - - dump/raw/org/train/wav.scp
91
+ - speech
92
+ - sound
93
+ valid_data_path_and_name_and_type:
94
+ - - dump/raw/org/test/text
95
+ - text
96
+ - text
97
+ - - dump/raw/org/test/wav.scp
98
+ - speech
99
+ - sound
100
+ allow_variable_data_keys: false
101
+ max_cache_size: 0.0
102
+ max_cache_fd: 32
103
+ valid_max_cache_size: null
104
+ optim: adamw
105
+ optim_conf:
106
+ lr: 0.0002
107
+ betas:
108
+ - 0.8
109
+ - 0.99
110
+ eps: 1.0e-09
111
+ weight_decay: 0.0
112
+ scheduler: exponentiallr
113
+ scheduler_conf:
114
+ gamma: 0.999875
115
+ optim2: adamw
116
+ optim2_conf:
117
+ lr: 0.0002
118
+ betas:
119
+ - 0.8
120
+ - 0.99
121
+ eps: 1.0e-09
122
+ weight_decay: 0.0
123
+ scheduler2: exponentiallr
124
+ scheduler2_conf:
125
+ gamma: 0.999875
126
+ generator_first: false
127
+ token_list:
128
+ - <blank>
129
+ - <unk>
130
+ - <space>
131
+ - N
132
+ - E
133
+ - A
134
+ - I
135
+ - O
136
+ - U
137
+ - L
138
+ - K
139
+ - M
140
+ - S
141
+ - B
142
+ - W
143
+ - T
144
+ - F
145
+ - R
146
+ - Y
147
+ - Z
148
+ - D
149
+ - G
150
+ - J
151
+ - P
152
+ - C
153
+ - V
154
+ - <sos/eos>
155
+ odim: null
156
+ model_conf: {}
157
+ use_preprocessor: true
158
+ token_type: char
159
+ bpemodel: null
160
+ non_linguistic_symbols: null
161
+ cleaner: tacotron
162
+ g2p: g2p_en
163
+ feats_extract: linear_spectrogram
164
+ feats_extract_conf:
165
+ n_fft: 1024
166
+ hop_length: 256
167
+ win_length: null
168
+ normalize: null
169
+ normalize_conf: {}
170
+ tts: vits
171
+ tts_conf:
172
+ generator_type: vits_generator
173
+ generator_params:
174
+ hidden_channels: 192
175
+ spks: -1
176
+ global_channels: -1
177
+ segment_size: 32
178
+ text_encoder_attention_heads: 2
179
+ text_encoder_ffn_expand: 4
180
+ text_encoder_blocks: 6
181
+ text_encoder_positionwise_layer_type: conv1d
182
+ text_encoder_positionwise_conv_kernel_size: 3
183
+ text_encoder_positional_encoding_layer_type: rel_pos
184
+ text_encoder_self_attention_layer_type: rel_selfattn
185
+ text_encoder_activation_type: swish
186
+ text_encoder_normalize_before: true
187
+ text_encoder_dropout_rate: 0.1
188
+ text_encoder_positional_dropout_rate: 0.0
189
+ text_encoder_attention_dropout_rate: 0.1
190
+ use_macaron_style_in_text_encoder: true
191
+ use_conformer_conv_in_text_encoder: false
192
+ text_encoder_conformer_kernel_size: -1
193
+ decoder_kernel_size: 7
194
+ decoder_channels: 512
195
+ decoder_upsample_scales:
196
+ - 8
197
+ - 8
198
+ - 2
199
+ - 2
200
+ decoder_upsample_kernel_sizes:
201
+ - 16
202
+ - 16
203
+ - 4
204
+ - 4
205
+ decoder_resblock_kernel_sizes:
206
+ - 3
207
+ - 7
208
+ - 11
209
+ decoder_resblock_dilations:
210
+ - - 1
211
+ - 3
212
+ - 5
213
+ - - 1
214
+ - 3
215
+ - 5
216
+ - - 1
217
+ - 3
218
+ - 5
219
+ use_weight_norm_in_decoder: true
220
+ posterior_encoder_kernel_size: 5
221
+ posterior_encoder_layers: 16
222
+ posterior_encoder_stacks: 1
223
+ posterior_encoder_base_dilation: 1
224
+ posterior_encoder_dropout_rate: 0.0
225
+ use_weight_norm_in_posterior_encoder: true
226
+ flow_flows: 4
227
+ flow_kernel_size: 5
228
+ flow_base_dilation: 1
229
+ flow_layers: 4
230
+ flow_dropout_rate: 0.0
231
+ use_weight_norm_in_flow: true
232
+ use_only_mean_in_flow: true
233
+ stochastic_duration_predictor_kernel_size: 3
234
+ stochastic_duration_predictor_dropout_rate: 0.5
235
+ stochastic_duration_predictor_flows: 4
236
+ stochastic_duration_predictor_dds_conv_layers: 3
237
+ vocabs: 27
238
+ aux_channels: 513
239
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
240
+ discriminator_params:
241
+ scales: 1
242
+ scale_downsample_pooling: AvgPool1d
243
+ scale_downsample_pooling_params:
244
+ kernel_size: 4
245
+ stride: 2
246
+ padding: 2
247
+ scale_discriminator_params:
248
+ in_channels: 1
249
+ out_channels: 1
250
+ kernel_sizes:
251
+ - 15
252
+ - 41
253
+ - 5
254
+ - 3
255
+ channels: 128
256
+ max_downsample_channels: 1024
257
+ max_groups: 16
258
+ bias: true
259
+ downsample_scales:
260
+ - 2
261
+ - 2
262
+ - 4
263
+ - 4
264
+ - 1
265
+ nonlinear_activation: LeakyReLU
266
+ nonlinear_activation_params:
267
+ negative_slope: 0.1
268
+ use_weight_norm: true
269
+ use_spectral_norm: false
270
+ follow_official_norm: false
271
+ periods:
272
+ - 2
273
+ - 3
274
+ - 5
275
+ - 7
276
+ - 11
277
+ period_discriminator_params:
278
+ in_channels: 1
279
+ out_channels: 1
280
+ kernel_sizes:
281
+ - 5
282
+ - 3
283
+ channels: 32
284
+ downsample_scales:
285
+ - 3
286
+ - 3
287
+ - 3
288
+ - 3
289
+ - 1
290
+ max_downsample_channels: 1024
291
+ bias: true
292
+ nonlinear_activation: LeakyReLU
293
+ nonlinear_activation_params:
294
+ negative_slope: 0.1
295
+ use_weight_norm: true
296
+ use_spectral_norm: false
297
+ generator_adv_loss_params:
298
+ average_by_discriminators: false
299
+ loss_type: mse
300
+ discriminator_adv_loss_params:
301
+ average_by_discriminators: false
302
+ loss_type: mse
303
+ feat_match_loss_params:
304
+ average_by_discriminators: false
305
+ average_by_layers: false
306
+ include_final_outputs: true
307
+ mel_loss_params:
308
+ fs: 44100
309
+ n_fft: 1024
310
+ hop_length: 256
311
+ win_length: null
312
+ window: hann
313
+ n_mels: 80
314
+ fmin: 0
315
+ fmax: null
316
+ log_base: null
317
+ lambda_adv: 1.0
318
+ lambda_mel: 45.0
319
+ lambda_feat_match: 2.0
320
+ lambda_dur: 1.0
321
+ lambda_kl: 1.0
322
+ sampling_rate: 44100
323
+ cache_generator_outputs: true
324
+ pitch_extract: null
325
+ pitch_extract_conf: {}
326
+ pitch_normalize: null
327
+ pitch_normalize_conf: {}
328
+ energy_extract: null
329
+ energy_extract_conf: {}
330
+ energy_normalize: null
331
+ energy_normalize_conf: {}
332
+ required:
333
+ - output_dir
334
+ - token_list
335
+ version: '202204'
336
+ distributed: false
exp/tts_train_vits_raw_char_tacotron/images/discriminator_backward_time.png ADDED
exp/tts_train_vits_raw_char_tacotron/images/discriminator_fake_loss.png ADDED
exp/tts_train_vits_raw_char_tacotron/images/discriminator_forward_time.png ADDED
exp/tts_train_vits_raw_char_tacotron/images/discriminator_loss.png ADDED
exp/tts_train_vits_raw_char_tacotron/images/discriminator_optim_step_time.png ADDED
exp/tts_train_vits_raw_char_tacotron/images/discriminator_real_loss.png ADDED
exp/tts_train_vits_raw_char_tacotron/images/discriminator_train_time.png ADDED
exp/tts_train_vits_raw_char_tacotron/images/generator_adv_loss.png ADDED
exp/tts_train_vits_raw_char_tacotron/images/generator_backward_time.png ADDED
exp/tts_train_vits_raw_char_tacotron/images/generator_dur_loss.png ADDED
exp/tts_train_vits_raw_char_tacotron/images/generator_feat_match_loss.png ADDED
exp/tts_train_vits_raw_char_tacotron/images/generator_forward_time.png ADDED
exp/tts_train_vits_raw_char_tacotron/images/generator_kl_loss.png ADDED
exp/tts_train_vits_raw_char_tacotron/images/generator_loss.png ADDED
exp/tts_train_vits_raw_char_tacotron/images/generator_mel_loss.png ADDED
exp/tts_train_vits_raw_char_tacotron/images/generator_optim_step_time.png ADDED
exp/tts_train_vits_raw_char_tacotron/images/generator_train_time.png ADDED
exp/tts_train_vits_raw_char_tacotron/images/gpu_max_cached_mem_GB.png ADDED
exp/tts_train_vits_raw_char_tacotron/images/iter_time.png ADDED
exp/tts_train_vits_raw_char_tacotron/images/optim0_lr0.png ADDED
exp/tts_train_vits_raw_char_tacotron/images/optim1_lr0.png ADDED
exp/tts_train_vits_raw_char_tacotron/images/train_time.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202204'
2
+ files:
3
+ model_file: exp/44k/tts_train_vits_raw_char_tacotron/20epoch.pth
4
+ python: "3.7.13 (default, Apr 24 2022, 01:04:09) \n[GCC 7.5.0]"
5
+ timestamp: 1653400662.766302
6
+ torch: 1.10.1+cu111
7
+ yaml_files:
8
+ train_config: exp/44k/tts_train_vits_raw_char_tacotron/config.yaml