Yuning Wu committed on
Commit
9d99e8b
1 Parent(s): 40c5dfd

Update model

Browse files
README.md ADDED
@@ -0,0 +1 @@
 
 
1
+
exp/svs_stats_raw_phn_None_zh/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:512c83dfde9de218b11b00b8a4685f74eb248e302b5ca69d765b78b73b666f95
3
+ size 1402
exp/svs_stats_raw_phn_None_zh/train/pitch_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b5dd89bdd1c983f409124980ec3c4de47e89a87b2e5a1be09e507709936be0b
3
+ size 770
exp/visinger1/263epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:776c60dc4c9ea0c3fbc033e40c2d2ce59c70b13bf6ba9265501f86518dd83907
3
+ size 421788149
exp/visinger1/config.yaml ADDED
@@ -0,0 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_vits.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/visinger1
7
+ ngpu: 1
8
+ seed: 777
9
+ num_workers: 4
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: true
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: false
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 300
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - train
38
+ - total_count
39
+ - max
40
+ keep_nbest_models: 10
41
+ nbest_averaging_interval: 0
42
+ grad_clip: -1
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 1
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: false
50
+ log_interval: 50
51
+ use_matplotlib: true
52
+ use_tensorboard: true
53
+ create_graph_in_tensorboard: false
54
+ use_wandb: false
55
+ wandb_project: null
56
+ wandb_id: null
57
+ wandb_entity: null
58
+ wandb_name: null
59
+ wandb_model_log_interval: -1
60
+ detect_anomaly: false
61
+ pretrain_path: null
62
+ init_param: []
63
+ ignore_init_mismatch: false
64
+ freeze_param: []
65
+ num_iters_per_epoch: 1000
66
+ batch_size: 20
67
+ valid_batch_size: null
68
+ batch_bins: 500000
69
+ valid_batch_bins: null
70
+ train_shape_file:
71
+ - exp/svs_stats_raw_phn_None_zh/train/text_shape.phn
72
+ - exp/svs_stats_raw_phn_None_zh/train/singing_shape
73
+ valid_shape_file:
74
+ - exp/svs_stats_raw_phn_None_zh/valid/text_shape.phn
75
+ - exp/svs_stats_raw_phn_None_zh/valid/singing_shape
76
+ batch_type: numel
77
+ valid_batch_type: null
78
+ fold_length:
79
+ - 150
80
+ - 204800
81
+ sort_in_batch: descending
82
+ sort_batch: descending
83
+ multiple_iterator: false
84
+ chunk_length: 500
85
+ chunk_shift_ratio: 0.5
86
+ num_cache_chunks: 1024
87
+ chunk_excluded_key_prefixes: []
88
+ train_data_path_and_name_and_type:
89
+ - - dump/raw/tr_no_dev/text
90
+ - text
91
+ - text
92
+ - - dump/raw/tr_no_dev/wav.scp
93
+ - singing
94
+ - sound
95
+ - - dump/raw/tr_no_dev/label
96
+ - label
97
+ - duration
98
+ - - dump/raw/tr_no_dev/score.scp
99
+ - score
100
+ - score
101
+ - - exp/svs_stats_raw_phn_None_zh/train/collect_feats/pitch.scp
102
+ - pitch
103
+ - npy
104
+ - - exp/svs_stats_raw_phn_None_zh/train/collect_feats/feats.scp
105
+ - feats
106
+ - npy
107
+ valid_data_path_and_name_and_type:
108
+ - - dump/raw/dev/text
109
+ - text
110
+ - text
111
+ - - dump/raw/dev/wav.scp
112
+ - singing
113
+ - sound
114
+ - - dump/raw/dev/label
115
+ - label
116
+ - duration
117
+ - - dump/raw/dev/score.scp
118
+ - score
119
+ - score
120
+ - - exp/svs_stats_raw_phn_None_zh/valid/collect_feats/pitch.scp
121
+ - pitch
122
+ - npy
123
+ - - exp/svs_stats_raw_phn_None_zh/valid/collect_feats/feats.scp
124
+ - feats
125
+ - npy
126
+ allow_variable_data_keys: false
127
+ max_cache_size: 0.0
128
+ max_cache_fd: 32
129
+ valid_max_cache_size: null
130
+ exclude_weight_decay: false
131
+ exclude_weight_decay_conf: {}
132
+ optim: adamw
133
+ optim_conf:
134
+ lr: 0.0002
135
+ betas:
136
+ - 0.8
137
+ - 0.99
138
+ eps: 1.0e-09
139
+ weight_decay: 0.0
140
+ scheduler: exponentiallr
141
+ scheduler_conf:
142
+ gamma: 0.999875
143
+ optim2: adamw
144
+ optim2_conf:
145
+ lr: 0.0002
146
+ betas:
147
+ - 0.8
148
+ - 0.99
149
+ eps: 1.0e-09
150
+ weight_decay: 0.0
151
+ scheduler2: exponentiallr
152
+ scheduler2_conf:
153
+ gamma: 0.999875
154
+ generator_first: false
155
+ token_list:
156
+ - <blank>
157
+ - <unk>
158
+ - SP
159
+ - i
160
+ - AP
161
+ - e
162
+ - y
163
+ - d
164
+ - w
165
+ - sh
166
+ - ai
167
+ - n
168
+ - x
169
+ - j
170
+ - ian
171
+ - u
172
+ - l
173
+ - h
174
+ - b
175
+ - o
176
+ - zh
177
+ - an
178
+ - ou
179
+ - m
180
+ - q
181
+ - z
182
+ - en
183
+ - g
184
+ - ing
185
+ - ei
186
+ - ao
187
+ - ang
188
+ - uo
189
+ - eng
190
+ - t
191
+ - a
192
+ - ong
193
+ - ui
194
+ - k
195
+ - f
196
+ - r
197
+ - iang
198
+ - ch
199
+ - v
200
+ - in
201
+ - iao
202
+ - ie
203
+ - iu
204
+ - c
205
+ - s
206
+ - van
207
+ - p
208
+ - ve
209
+ - uan
210
+ - uang
211
+ - ia
212
+ - ua
213
+ - uai
214
+ - un
215
+ - er
216
+ - vn
217
+ - iong
218
+ - <sos/eos>
219
+ odim: null
220
+ model_conf: {}
221
+ use_preprocessor: true
222
+ token_type: phn
223
+ bpemodel: null
224
+ non_linguistic_symbols: null
225
+ cleaner: null
226
+ g2p: null
227
+ fs: 22050
228
+ score_feats_extract: syllable_score_feats
229
+ score_feats_extract_conf:
230
+ fs: 22050
231
+ n_fft: 1024
232
+ win_length: null
233
+ hop_length: 256
234
+ feats_extract: linear_spectrogram
235
+ feats_extract_conf:
236
+ n_fft: 1024
237
+ hop_length: 256
238
+ win_length: null
239
+ normalize: null
240
+ normalize_conf: {}
241
+ svs: vits
242
+ svs_conf:
243
+ generator_type: vits_generator
244
+ generator_params:
245
+ hidden_channels: 192
246
+ spks: -1
247
+ global_channels: -1
248
+ segment_size: 32
249
+ text_encoder_attention_heads: 2
250
+ text_encoder_ffn_expand: 4
251
+ text_encoder_blocks: 6
252
+ text_encoder_positionwise_layer_type: conv1d
253
+ text_encoder_positionwise_conv_kernel_size: 3
254
+ text_encoder_positional_encoding_layer_type: rel_pos
255
+ text_encoder_self_attention_layer_type: rel_selfattn
256
+ text_encoder_activation_type: swish
257
+ text_encoder_normalize_before: true
258
+ text_encoder_dropout_rate: 0.1
259
+ text_encoder_positional_dropout_rate: 0.0
260
+ text_encoder_attention_dropout_rate: 0.1
261
+ use_macaron_style_in_text_encoder: true
262
+ use_conformer_conv_in_text_encoder: false
263
+ text_encoder_conformer_kernel_size: -1
264
+ decoder_kernel_size: 7
265
+ decoder_channels: 512
266
+ decoder_upsample_scales:
267
+ - 8
268
+ - 8
269
+ - 2
270
+ - 2
271
+ decoder_upsample_kernel_sizes:
272
+ - 16
273
+ - 16
274
+ - 4
275
+ - 4
276
+ decoder_resblock_kernel_sizes:
277
+ - 3
278
+ - 7
279
+ - 11
280
+ decoder_resblock_dilations:
281
+ - - 1
282
+ - 3
283
+ - 5
284
+ - - 1
285
+ - 3
286
+ - 5
287
+ - - 1
288
+ - 3
289
+ - 5
290
+ use_weight_norm_in_decoder: true
291
+ posterior_encoder_kernel_size: 5
292
+ posterior_encoder_layers: 16
293
+ posterior_encoder_stacks: 1
294
+ posterior_encoder_base_dilation: 1
295
+ posterior_encoder_dropout_rate: 0.0
296
+ use_weight_norm_in_posterior_encoder: true
297
+ flow_flows: 4
298
+ flow_kernel_size: 5
299
+ flow_base_dilation: 1
300
+ flow_layers: 4
301
+ flow_dropout_rate: 0.0
302
+ use_weight_norm_in_flow: true
303
+ use_only_mean_in_flow: true
304
+ vocabs: 63
305
+ aux_channels: 513
306
+ use_visinger: true
307
+ use_dp: true
308
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
309
+ discriminator_params:
310
+ scales: 1
311
+ scale_downsample_pooling: AvgPool1d
312
+ scale_downsample_pooling_params:
313
+ kernel_size: 4
314
+ stride: 2
315
+ padding: 2
316
+ scale_discriminator_params:
317
+ in_channels: 1
318
+ out_channels: 1
319
+ kernel_sizes:
320
+ - 15
321
+ - 41
322
+ - 5
323
+ - 3
324
+ channels: 128
325
+ max_downsample_channels: 1024
326
+ max_groups: 16
327
+ bias: true
328
+ downsample_scales:
329
+ - 2
330
+ - 2
331
+ - 4
332
+ - 4
333
+ - 1
334
+ nonlinear_activation: LeakyReLU
335
+ nonlinear_activation_params:
336
+ negative_slope: 0.1
337
+ use_weight_norm: true
338
+ use_spectral_norm: false
339
+ follow_official_norm: false
340
+ periods:
341
+ - 2
342
+ - 3
343
+ - 5
344
+ - 7
345
+ - 11
346
+ period_discriminator_params:
347
+ in_channels: 1
348
+ out_channels: 1
349
+ kernel_sizes:
350
+ - 5
351
+ - 3
352
+ channels: 32
353
+ downsample_scales:
354
+ - 3
355
+ - 3
356
+ - 3
357
+ - 3
358
+ - 1
359
+ max_downsample_channels: 1024
360
+ bias: true
361
+ nonlinear_activation: LeakyReLU
362
+ nonlinear_activation_params:
363
+ negative_slope: 0.1
364
+ use_weight_norm: true
365
+ use_spectral_norm: false
366
+ generator_adv_loss_params:
367
+ average_by_discriminators: false
368
+ loss_type: mse
369
+ discriminator_adv_loss_params:
370
+ average_by_discriminators: false
371
+ loss_type: mse
372
+ feat_match_loss_params:
373
+ average_by_discriminators: false
374
+ average_by_layers: false
375
+ include_final_outputs: true
376
+ mel_loss_params:
377
+ fs: 22050
378
+ n_fft: 1024
379
+ hop_length: 256
380
+ win_length: null
381
+ window: hann
382
+ n_mels: 80
383
+ fmin: 0
384
+ fmax: null
385
+ log_base: null
386
+ lambda_adv: 1.0
387
+ lambda_mel: 45.0
388
+ lambda_feat_match: 2.0
389
+ lambda_dur: 0.1
390
+ lambda_pitch: 1.0
391
+ lambda_phoneme: 1.0
392
+ lambda_kl: 1.0
393
+ sampling_rate: 22050
394
+ cache_generator_outputs: true
395
+ pitch_extract: dio
396
+ pitch_extract_conf:
397
+ use_token_averaged_f0: false
398
+ fs: 22050
399
+ n_fft: 1024
400
+ hop_length: 256
401
+ f0max: 400
402
+ f0min: 80
403
+ pitch_normalize: global_mvn
404
+ pitch_normalize_conf:
405
+ stats_file: exp/svs_stats_raw_phn_None_zh/train/pitch_stats.npz
406
+ energy_extract: null
407
+ energy_extract_conf: {}
408
+ energy_normalize: null
409
+ energy_normalize_conf: {}
410
+ required:
411
+ - output_dir
412
+ - token_list
413
+ version: '202301'
414
+ distributed: false
exp/visinger1/images/ctc_loss.png ADDED
exp/visinger1/images/discriminator_backward_time.png ADDED
exp/visinger1/images/discriminator_fake_loss.png ADDED
exp/visinger1/images/discriminator_forward_time.png ADDED
exp/visinger1/images/discriminator_loss.png ADDED
exp/visinger1/images/discriminator_optim_step_time.png ADDED
exp/visinger1/images/discriminator_real_loss.png ADDED
exp/visinger1/images/discriminator_train_time.png ADDED
exp/visinger1/images/generator_adv_loss.png ADDED
exp/visinger1/images/generator_backward_time.png ADDED
exp/visinger1/images/generator_dur_loss.png ADDED
exp/visinger1/images/generator_feat_match_loss.png ADDED
exp/visinger1/images/generator_forward_time.png ADDED
exp/visinger1/images/generator_kl_loss.png ADDED
exp/visinger1/images/generator_loss.png ADDED
exp/visinger1/images/generator_mel_loss.png ADDED
exp/visinger1/images/generator_optim_step_time.png ADDED
exp/visinger1/images/generator_train_time.png ADDED
exp/visinger1/images/gpu_max_cached_mem_GB.png ADDED
exp/visinger1/images/iter_time.png ADDED
exp/visinger1/images/optim0_lr0.png ADDED
exp/visinger1/images/optim1_lr0.png ADDED
exp/visinger1/images/pitch_loss.png ADDED
exp/visinger1/images/train_time.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202301'
2
+ files:
3
+ model_file: exp/visinger1/263epoch.pth
4
+ python: "3.7.13 (default, Mar 29 2022, 02:18:16) \n[GCC 7.5.0]"
5
+ timestamp: 1681119551.501788
6
+ torch: 1.9.0
7
+ yaml_files:
8
+ train_config: exp/visinger1/config.yaml