Li-Jen committed on
Commit
79749c8
1 Parent(s): 5fc2262

Update model

Browse files
README.md CHANGED
@@ -35,7 +35,427 @@ cd egs2/chatbot/tts1
35
  <details><summary>expand</summary>
36
 
37
  ```
38
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  ```
40
 
41
  </details>
 
35
  <details><summary>expand</summary>
36
 
37
  ```
38
+ config: conf/tuning/train_gst+xvector_conformer_fastspeech2.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ dry_run: false
42
+ iterator_type: sequence
43
+ output_dir: exp/tts_finetune_fastpeech2_g2pw
44
+ ngpu: 1
45
+ seed: 0
46
+ num_workers: 1
47
+ num_att_plot: 3
48
+ dist_backend: nccl
49
+ dist_init_method: env://
50
+ dist_world_size: null
51
+ dist_rank: null
52
+ local_rank: 0
53
+ dist_master_addr: null
54
+ dist_master_port: null
55
+ dist_launcher: null
56
+ multiprocessing_distributed: false
57
+ unused_parameters: false
58
+ sharded_ddp: false
59
+ cudnn_enabled: true
60
+ cudnn_benchmark: false
61
+ cudnn_deterministic: true
62
+ collect_stats: false
63
+ write_collected_feats: false
64
+ max_epoch: 100
65
+ patience: null
66
+ val_scheduler_criterion:
67
+ - valid
68
+ - loss
69
+ early_stopping_criterion:
70
+ - valid
71
+ - loss
72
+ - min
73
+ best_model_criterion:
74
+ - - valid
75
+ - loss
76
+ - min
77
+ - - train
78
+ - loss
79
+ - min
80
+ keep_nbest_models: 5
81
+ nbest_averaging_interval: 0
82
+ grad_clip: 1.0
83
+ grad_clip_type: 2.0
84
+ grad_noise: false
85
+ accum_grad: 1
86
+ no_forward_run: false
87
+ resume: true
88
+ train_dtype: float32
89
+ use_amp: false
90
+ log_interval: null
91
+ use_matplotlib: true
92
+ use_tensorboard: true
93
+ create_graph_in_tensorboard: false
94
+ use_wandb: false
95
+ wandb_project: null
96
+ wandb_id: null
97
+ wandb_entity: null
98
+ wandb_name: null
99
+ wandb_model_log_interval: -1
100
+ detect_anomaly: false
101
+ pretrain_path: null
102
+ init_param:
103
+ - ../../aishell3/tts1/exp/tts_train_gst+xvector_conformer_fastspeech2_raw_phn_pypinyin_g2p_phone/train.loss.ave_5best.pth:tts:tts
104
+ ignore_init_mismatch: false
105
+ freeze_param: []
106
+ num_iters_per_epoch: null
107
+ batch_size: 50
108
+ valid_batch_size: null
109
+ batch_bins: 1000000
110
+ valid_batch_bins: null
111
+ train_shape_file:
112
+ - exp/tts_finetune_tacotron2_raw_phn_chatbot_own_model/stats_phn/train/text_shape.phn
113
+ - exp/tts_finetune_tacotron2_raw_phn_chatbot_own_model/stats_phn/train/speech_shape
114
+ valid_shape_file:
115
+ - exp/tts_finetune_tacotron2_raw_phn_chatbot_own_model/stats_phn/valid/text_shape.phn
116
+ - exp/tts_finetune_tacotron2_raw_phn_chatbot_own_model/stats_phn/valid/speech_shape
117
+ batch_type: sorted
118
+ valid_batch_type: null
119
+ fold_length:
120
+ - 150
121
+ - 240000
122
+ sort_in_batch: descending
123
+ sort_batch: descending
124
+ multiple_iterator: false
125
+ chunk_length: 500
126
+ chunk_shift_ratio: 0.5
127
+ num_cache_chunks: 1024
128
+ train_data_path_and_name_and_type:
129
+ - - dump/raw/train_no_dev_test_phn/text
130
+ - text
131
+ - text
132
+ - - exp/tts_finetune_tacotron2_raw_phn_chatbot_own_model/decode_use_teacher_forcingtrue_train.loss.ave//train_no_dev_test_phn/durations
133
+ - durations
134
+ - text_int
135
+ - - dump/raw/train_no_dev_test_phn/wav.scp
136
+ - speech
137
+ - sound
138
+ - - exp/tts_finetune_tacotron2_raw_phn_chatbot_own_model/stats_phn/train/collect_feats/pitch.scp
139
+ - pitch
140
+ - npy
141
+ - - exp/tts_finetune_tacotron2_raw_phn_chatbot_own_model/stats_phn/train/collect_feats/energy.scp
142
+ - energy
143
+ - npy
144
+ - - dump/xvector/train_no_dev_test_phn/xvector.scp
145
+ - spembs
146
+ - kaldi_ark
147
+ valid_data_path_and_name_and_type:
148
+ - - dump/raw/dev_phn/text
149
+ - text
150
+ - text
151
+ - - exp/tts_finetune_tacotron2_raw_phn_chatbot_own_model/decode_use_teacher_forcingtrue_train.loss.ave//dev_phn/durations
152
+ - durations
153
+ - text_int
154
+ - - dump/raw/dev_phn/wav.scp
155
+ - speech
156
+ - sound
157
+ - - exp/tts_finetune_tacotron2_raw_phn_chatbot_own_model/stats_phn/valid/collect_feats/pitch.scp
158
+ - pitch
159
+ - npy
160
+ - - exp/tts_finetune_tacotron2_raw_phn_chatbot_own_model/stats_phn/valid/collect_feats/energy.scp
161
+ - energy
162
+ - npy
163
+ - - dump/xvector/dev_phn/xvector.scp
164
+ - spembs
165
+ - kaldi_ark
166
+ allow_variable_data_keys: false
167
+ max_cache_size: 0.0
168
+ max_cache_fd: 32
169
+ valid_max_cache_size: null
170
+ optim: adam
171
+ optim_conf:
172
+ lr: 1.0
173
+ scheduler: noamlr
174
+ scheduler_conf:
175
+ model_size: 384
176
+ warmup_steps: 2000
177
+ token_list:
178
+ - <blank>
179
+ - <unk>
180
+ - d
181
+ - sh
182
+ - j
183
+ - i4
184
+ - zh
185
+ - l
186
+ - x
187
+ - e
188
+ - b
189
+ - g
190
+ - i1
191
+ - h
192
+ - q
193
+ - m
194
+ - t
195
+ - i2
196
+ - u4
197
+ - z
198
+ - ch
199
+ - i3
200
+ - f
201
+ - s
202
+ - n
203
+ - iou3
204
+ - r
205
+ - ian4
206
+ - ong1
207
+ - uei4
208
+ - e4
209
+ - en2
210
+ - ai4
211
+ - k
212
+ - ing2
213
+ - a1
214
+ - uo3
215
+ - u3
216
+ - ao4
217
+ - p
218
+ - an1
219
+ - eng2
220
+ - e2
221
+ - in1
222
+ - c
223
+ - ai2
224
+ - an4
225
+ - ian2
226
+ - u2
227
+ - ang4
228
+ - ian1
229
+ - ai3
230
+ - ing1
231
+ - ao3
232
+ - uo4
233
+ - ian3
234
+ - ing4
235
+ - ü4
236
+ - ang1
237
+ - u1
238
+ - iao4
239
+ - eng1
240
+ - iou4
241
+ - a4
242
+ - üan2
243
+ - ie4
244
+ - ou4
245
+ - er4
246
+ - en1
247
+ - ong2
248
+ - e1
249
+ - an3
250
+ - ei4
251
+ - uo2
252
+ - ou3
253
+ - ang2
254
+ - iang4
255
+ - ou1
256
+ - ang3
257
+ - an2
258
+ - eng4
259
+ - ong4
260
+ - uan4
261
+ - a3
262
+ - ia4
263
+ - ia1
264
+ - iao1
265
+ - iang1
266
+ - iou2
267
+ - uo1
268
+ - ei3
269
+ - iao3
270
+ - in4
271
+ - e3
272
+ - ü3
273
+ - iang3
274
+ - uei2
275
+ - en3
276
+ - uan1
277
+ - ie3
278
+ - ao1
279
+ - ai1
280
+ - üe4
281
+ - ü2
282
+ - ing3
283
+ - en4
284
+ - uei1
285
+ - er2
286
+ - uan3
287
+ - ü1
288
+ - in3
289
+ - en
290
+ - üe2
291
+ - ie2
292
+ - ei2
293
+ - ua4
294
+ - uan2
295
+ - in2
296
+ - a2
297
+ - ie1
298
+ - iang2
299
+ - ou2
300
+ - ong3
301
+ - uang3
302
+ - eng3
303
+ - uen1
304
+ - uai4
305
+ - ün4
306
+ - uang4
307
+ - uei3
308
+ - uen2
309
+ - uen4
310
+ - i
311
+ - iong4
312
+ - v3
313
+ - iao2
314
+ - üan4
315
+ - uang1
316
+ - ei1
317
+ - o2
318
+ - iou1
319
+ - uang2
320
+ - a
321
+ - ao2
322
+ - o1
323
+ - ua2
324
+ - uen3
325
+ - ua1
326
+ - v4
327
+ - üan3
328
+ - ün1
329
+ - üe1
330
+ - ün2
331
+ - o4
332
+ - er3
333
+ - iong3
334
+ - üan1
335
+ - ia3
336
+ - ia2
337
+ - iong1
338
+ - üe3
339
+ - ve4
340
+ - iong2
341
+ - uai2
342
+ - er
343
+ - ua3
344
+ - uai1
345
+ - ou
346
+ - ün3
347
+ - uai3
348
+ - ia
349
+ - uo
350
+ - o3
351
+ - v2
352
+ - ueng1
353
+ - o
354
+ - ei
355
+ - ua
356
+ - io1
357
+ - <sos/eos>
358
+ odim: null
359
+ model_conf: {}
360
+ use_preprocessor: true
361
+ token_type: phn
362
+ bpemodel: null
363
+ non_linguistic_symbols: null
364
+ cleaner: null
365
+ g2p: null
366
+ feats_extract: fbank
367
+ feats_extract_conf:
368
+ n_fft: 2048
369
+ hop_length: 300
370
+ win_length: 1200
371
+ fs: 24000
372
+ fmin: 80
373
+ fmax: 7600
374
+ n_mels: 80
375
+ normalize: global_mvn
376
+ normalize_conf:
377
+ stats_file: exp/tts_finetune_tacotron2_raw_phn_chatbot_own_model/stats_phn/train/feats_stats.npz
378
+ tts: fastspeech2
379
+ tts_conf:
380
+ adim: 384
381
+ aheads: 2
382
+ elayers: 4
383
+ eunits: 1536
384
+ dlayers: 4
385
+ dunits: 1536
386
+ positionwise_layer_type: conv1d
387
+ positionwise_conv_kernel_size: 3
388
+ duration_predictor_layers: 2
389
+ duration_predictor_chans: 256
390
+ duration_predictor_kernel_size: 3
391
+ postnet_layers: 5
392
+ postnet_filts: 5
393
+ postnet_chans: 256
394
+ use_masking: true
395
+ encoder_normalize_before: true
396
+ decoder_normalize_before: true
397
+ reduction_factor: 1
398
+ encoder_type: conformer
399
+ decoder_type: conformer
400
+ conformer_pos_enc_layer_type: rel_pos
401
+ conformer_self_attn_layer_type: rel_selfattn
402
+ conformer_activation_type: swish
403
+ use_macaron_style_in_conformer: true
404
+ use_cnn_in_conformer: true
405
+ conformer_enc_kernel_size: 7
406
+ conformer_dec_kernel_size: 31
407
+ init_type: xavier_uniform
408
+ transformer_enc_dropout_rate: 0.2
409
+ transformer_enc_positional_dropout_rate: 0.2
410
+ transformer_enc_attn_dropout_rate: 0.2
411
+ transformer_dec_dropout_rate: 0.2
412
+ transformer_dec_positional_dropout_rate: 0.2
413
+ transformer_dec_attn_dropout_rate: 0.2
414
+ pitch_predictor_layers: 5
415
+ pitch_predictor_chans: 256
416
+ pitch_predictor_kernel_size: 5
417
+ pitch_predictor_dropout: 0.5
418
+ pitch_embed_kernel_size: 1
419
+ pitch_embed_dropout: 0.0
420
+ stop_gradient_from_pitch_predictor: true
421
+ energy_predictor_layers: 2
422
+ energy_predictor_chans: 256
423
+ energy_predictor_kernel_size: 3
424
+ energy_predictor_dropout: 0.5
425
+ energy_embed_kernel_size: 1
426
+ energy_embed_dropout: 0.0
427
+ stop_gradient_from_energy_predictor: false
428
+ spk_embed_dim: 512
429
+ spk_embed_integration_type: add
430
+ use_gst: true
431
+ gst_heads: 4
432
+ gst_tokens: 16
433
+ pitch_extract: dio
434
+ pitch_extract_conf:
435
+ fs: 24000
436
+ n_fft: 2048
437
+ hop_length: 300
438
+ f0max: 400
439
+ f0min: 80
440
+ reduction_factor: 1
441
+ pitch_normalize: global_mvn
442
+ pitch_normalize_conf:
443
+ stats_file: exp/tts_finetune_tacotron2_raw_phn_chatbot_own_model/stats_phn/train/pitch_stats.npz
444
+ energy_extract: energy
445
+ energy_extract_conf:
446
+ fs: 24000
447
+ n_fft: 2048
448
+ hop_length: 300
449
+ win_length: 1200
450
+ reduction_factor: 1
451
+ energy_normalize: global_mvn
452
+ energy_normalize_conf:
453
+ stats_file: exp/tts_finetune_tacotron2_raw_phn_chatbot_own_model/stats_phn/train/energy_stats.npz
454
+ required:
455
+ - output_dir
456
+ - token_list
457
+ version: '202211'
458
+ distributed: false
459
  ```
460
 
461
  </details>
dump/xvector/dev_phn/spk_xvector.ark ADDED
Binary file (2.07 kB). View file
 
dump/xvector/dev_phn/spk_xvector.scp ADDED
@@ -0,0 +1 @@
 
 
1
+ chatbot dump/xvector/dev_phn/spk_xvector.ark:8
dump/xvector/test_phn/spk_xvector.ark ADDED
Binary file (2.07 kB). View file
 
dump/xvector/test_phn/spk_xvector.scp ADDED
@@ -0,0 +1 @@
 
 
1
+ chatbot dump/xvector/test_phn/spk_xvector.ark:8
dump/xvector/train_no_dev_test_phn/spk_xvector.ark ADDED
Binary file (2.07 kB). View file
 
dump/xvector/train_no_dev_test_phn/spk_xvector.scp ADDED
@@ -0,0 +1 @@
 
 
1
+ chatbot dump/xvector/train_no_dev_test_phn/spk_xvector.ark:8
exp/tts_finetune_fastpeech2_g2pw/config.yaml ADDED
@@ -0,0 +1,424 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_gst+xvector_conformer_fastspeech2.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_finetune_fastpeech2_g2pw
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 100
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss
39
+ - min
40
+ - - train
41
+ - loss
42
+ - min
43
+ keep_nbest_models: 5
44
+ nbest_averaging_interval: 0
45
+ grad_clip: 1.0
46
+ grad_clip_type: 2.0
47
+ grad_noise: false
48
+ accum_grad: 1
49
+ no_forward_run: false
50
+ resume: true
51
+ train_dtype: float32
52
+ use_amp: false
53
+ log_interval: null
54
+ use_matplotlib: true
55
+ use_tensorboard: true
56
+ create_graph_in_tensorboard: false
57
+ use_wandb: false
58
+ wandb_project: null
59
+ wandb_id: null
60
+ wandb_entity: null
61
+ wandb_name: null
62
+ wandb_model_log_interval: -1
63
+ detect_anomaly: false
64
+ pretrain_path: null
65
+ init_param:
66
+ - ../../aishell3/tts1/exp/tts_train_gst+xvector_conformer_fastspeech2_raw_phn_pypinyin_g2p_phone/train.loss.ave_5best.pth:tts:tts
67
+ ignore_init_mismatch: false
68
+ freeze_param: []
69
+ num_iters_per_epoch: null
70
+ batch_size: 50
71
+ valid_batch_size: null
72
+ batch_bins: 1000000
73
+ valid_batch_bins: null
74
+ train_shape_file:
75
+ - exp/tts_finetune_tacotron2_raw_phn_chatbot_own_model/stats_phn/train/text_shape.phn
76
+ - exp/tts_finetune_tacotron2_raw_phn_chatbot_own_model/stats_phn/train/speech_shape
77
+ valid_shape_file:
78
+ - exp/tts_finetune_tacotron2_raw_phn_chatbot_own_model/stats_phn/valid/text_shape.phn
79
+ - exp/tts_finetune_tacotron2_raw_phn_chatbot_own_model/stats_phn/valid/speech_shape
80
+ batch_type: sorted
81
+ valid_batch_type: null
82
+ fold_length:
83
+ - 150
84
+ - 240000
85
+ sort_in_batch: descending
86
+ sort_batch: descending
87
+ multiple_iterator: false
88
+ chunk_length: 500
89
+ chunk_shift_ratio: 0.5
90
+ num_cache_chunks: 1024
91
+ train_data_path_and_name_and_type:
92
+ - - dump/raw/train_no_dev_test_phn/text
93
+ - text
94
+ - text
95
+ - - exp/tts_finetune_tacotron2_raw_phn_chatbot_own_model/decode_use_teacher_forcingtrue_train.loss.ave//train_no_dev_test_phn/durations
96
+ - durations
97
+ - text_int
98
+ - - dump/raw/train_no_dev_test_phn/wav.scp
99
+ - speech
100
+ - sound
101
+ - - exp/tts_finetune_tacotron2_raw_phn_chatbot_own_model/stats_phn/train/collect_feats/pitch.scp
102
+ - pitch
103
+ - npy
104
+ - - exp/tts_finetune_tacotron2_raw_phn_chatbot_own_model/stats_phn/train/collect_feats/energy.scp
105
+ - energy
106
+ - npy
107
+ - - dump/xvector/train_no_dev_test_phn/xvector.scp
108
+ - spembs
109
+ - kaldi_ark
110
+ valid_data_path_and_name_and_type:
111
+ - - dump/raw/dev_phn/text
112
+ - text
113
+ - text
114
+ - - exp/tts_finetune_tacotron2_raw_phn_chatbot_own_model/decode_use_teacher_forcingtrue_train.loss.ave//dev_phn/durations
115
+ - durations
116
+ - text_int
117
+ - - dump/raw/dev_phn/wav.scp
118
+ - speech
119
+ - sound
120
+ - - exp/tts_finetune_tacotron2_raw_phn_chatbot_own_model/stats_phn/valid/collect_feats/pitch.scp
121
+ - pitch
122
+ - npy
123
+ - - exp/tts_finetune_tacotron2_raw_phn_chatbot_own_model/stats_phn/valid/collect_feats/energy.scp
124
+ - energy
125
+ - npy
126
+ - - dump/xvector/dev_phn/xvector.scp
127
+ - spembs
128
+ - kaldi_ark
129
+ allow_variable_data_keys: false
130
+ max_cache_size: 0.0
131
+ max_cache_fd: 32
132
+ valid_max_cache_size: null
133
+ optim: adam
134
+ optim_conf:
135
+ lr: 1.0
136
+ scheduler: noamlr
137
+ scheduler_conf:
138
+ model_size: 384
139
+ warmup_steps: 2000
140
+ token_list:
141
+ - <blank>
142
+ - <unk>
143
+ - d
144
+ - sh
145
+ - j
146
+ - i4
147
+ - zh
148
+ - l
149
+ - x
150
+ - e
151
+ - b
152
+ - g
153
+ - i1
154
+ - h
155
+ - q
156
+ - m
157
+ - t
158
+ - i2
159
+ - u4
160
+ - z
161
+ - ch
162
+ - i3
163
+ - f
164
+ - s
165
+ - n
166
+ - iou3
167
+ - r
168
+ - ian4
169
+ - ong1
170
+ - uei4
171
+ - e4
172
+ - en2
173
+ - ai4
174
+ - k
175
+ - ing2
176
+ - a1
177
+ - uo3
178
+ - u3
179
+ - ao4
180
+ - p
181
+ - an1
182
+ - eng2
183
+ - e2
184
+ - in1
185
+ - c
186
+ - ai2
187
+ - an4
188
+ - ian2
189
+ - u2
190
+ - ang4
191
+ - ian1
192
+ - ai3
193
+ - ing1
194
+ - ao3
195
+ - uo4
196
+ - ian3
197
+ - ing4
198
+ - ü4
199
+ - ang1
200
+ - u1
201
+ - iao4
202
+ - eng1
203
+ - iou4
204
+ - a4
205
+ - üan2
206
+ - ie4
207
+ - ou4
208
+ - er4
209
+ - en1
210
+ - ong2
211
+ - e1
212
+ - an3
213
+ - ei4
214
+ - uo2
215
+ - ou3
216
+ - ang2
217
+ - iang4
218
+ - ou1
219
+ - ang3
220
+ - an2
221
+ - eng4
222
+ - ong4
223
+ - uan4
224
+ - a3
225
+ - ia4
226
+ - ia1
227
+ - iao1
228
+ - iang1
229
+ - iou2
230
+ - uo1
231
+ - ei3
232
+ - iao3
233
+ - in4
234
+ - e3
235
+ - ü3
236
+ - iang3
237
+ - uei2
238
+ - en3
239
+ - uan1
240
+ - ie3
241
+ - ao1
242
+ - ai1
243
+ - üe4
244
+ - ü2
245
+ - ing3
246
+ - en4
247
+ - uei1
248
+ - er2
249
+ - uan3
250
+ - ü1
251
+ - in3
252
+ - en
253
+ - üe2
254
+ - ie2
255
+ - ei2
256
+ - ua4
257
+ - uan2
258
+ - in2
259
+ - a2
260
+ - ie1
261
+ - iang2
262
+ - ou2
263
+ - ong3
264
+ - uang3
265
+ - eng3
266
+ - uen1
267
+ - uai4
268
+ - ün4
269
+ - uang4
270
+ - uei3
271
+ - uen2
272
+ - uen4
273
+ - i
274
+ - iong4
275
+ - v3
276
+ - iao2
277
+ - üan4
278
+ - uang1
279
+ - ei1
280
+ - o2
281
+ - iou1
282
+ - uang2
283
+ - a
284
+ - ao2
285
+ - o1
286
+ - ua2
287
+ - uen3
288
+ - ua1
289
+ - v4
290
+ - üan3
291
+ - ün1
292
+ - üe1
293
+ - ün2
294
+ - o4
295
+ - er3
296
+ - iong3
297
+ - üan1
298
+ - ia3
299
+ - ia2
300
+ - iong1
301
+ - üe3
302
+ - ve4
303
+ - iong2
304
+ - uai2
305
+ - er
306
+ - ua3
307
+ - uai1
308
+ - ou
309
+ - ün3
310
+ - uai3
311
+ - ia
312
+ - uo
313
+ - o3
314
+ - v2
315
+ - ueng1
316
+ - o
317
+ - ei
318
+ - ua
319
+ - io1
320
+ - <sos/eos>
321
+ odim: null
322
+ model_conf: {}
323
+ use_preprocessor: true
324
+ token_type: phn
325
+ bpemodel: null
326
+ non_linguistic_symbols: null
327
+ cleaner: null
328
+ g2p: null
329
+ feats_extract: fbank
330
+ feats_extract_conf:
331
+ n_fft: 2048
332
+ hop_length: 300
333
+ win_length: 1200
334
+ fs: 24000
335
+ fmin: 80
336
+ fmax: 7600
337
+ n_mels: 80
338
+ normalize: global_mvn
339
+ normalize_conf:
340
+ stats_file: exp/tts_finetune_tacotron2_raw_phn_chatbot_own_model/stats_phn/train/feats_stats.npz
341
+ tts: fastspeech2
342
+ tts_conf:
343
+ adim: 384
344
+ aheads: 2
345
+ elayers: 4
346
+ eunits: 1536
347
+ dlayers: 4
348
+ dunits: 1536
349
+ positionwise_layer_type: conv1d
350
+ positionwise_conv_kernel_size: 3
351
+ duration_predictor_layers: 2
352
+ duration_predictor_chans: 256
353
+ duration_predictor_kernel_size: 3
354
+ postnet_layers: 5
355
+ postnet_filts: 5
356
+ postnet_chans: 256
357
+ use_masking: true
358
+ encoder_normalize_before: true
359
+ decoder_normalize_before: true
360
+ reduction_factor: 1
361
+ encoder_type: conformer
362
+ decoder_type: conformer
363
+ conformer_pos_enc_layer_type: rel_pos
364
+ conformer_self_attn_layer_type: rel_selfattn
365
+ conformer_activation_type: swish
366
+ use_macaron_style_in_conformer: true
367
+ use_cnn_in_conformer: true
368
+ conformer_enc_kernel_size: 7
369
+ conformer_dec_kernel_size: 31
370
+ init_type: xavier_uniform
371
+ transformer_enc_dropout_rate: 0.2
372
+ transformer_enc_positional_dropout_rate: 0.2
373
+ transformer_enc_attn_dropout_rate: 0.2
374
+ transformer_dec_dropout_rate: 0.2
375
+ transformer_dec_positional_dropout_rate: 0.2
376
+ transformer_dec_attn_dropout_rate: 0.2
377
+ pitch_predictor_layers: 5
378
+ pitch_predictor_chans: 256
379
+ pitch_predictor_kernel_size: 5
380
+ pitch_predictor_dropout: 0.5
381
+ pitch_embed_kernel_size: 1
382
+ pitch_embed_dropout: 0.0
383
+ stop_gradient_from_pitch_predictor: true
384
+ energy_predictor_layers: 2
385
+ energy_predictor_chans: 256
386
+ energy_predictor_kernel_size: 3
387
+ energy_predictor_dropout: 0.5
388
+ energy_embed_kernel_size: 1
389
+ energy_embed_dropout: 0.0
390
+ stop_gradient_from_energy_predictor: false
391
+ spk_embed_dim: 512
392
+ spk_embed_integration_type: add
393
+ use_gst: true
394
+ gst_heads: 4
395
+ gst_tokens: 16
396
+ encoder_use_prompt: false
397
+ decoder_use_prompt: false
398
+ intermediate_use_prompt: false
399
+ pitch_extract: dio
400
+ pitch_extract_conf:
401
+ fs: 24000
402
+ n_fft: 2048
403
+ hop_length: 300
404
+ f0max: 400
405
+ f0min: 80
406
+ reduction_factor: 1
407
+ pitch_normalize: global_mvn
408
+ pitch_normalize_conf:
409
+ stats_file: exp/tts_finetune_tacotron2_raw_phn_chatbot_own_model/stats_phn/train/pitch_stats.npz
410
+ energy_extract: energy
411
+ energy_extract_conf:
412
+ fs: 24000
413
+ n_fft: 2048
414
+ hop_length: 300
415
+ win_length: 1200
416
+ reduction_factor: 1
417
+ energy_normalize: global_mvn
418
+ energy_normalize_conf:
419
+ stats_file: exp/tts_finetune_tacotron2_raw_phn_chatbot_own_model/stats_phn/train/energy_stats.npz
420
+ required:
421
+ - output_dir
422
+ - token_list
423
+ version: '202211'
424
+ distributed: false
exp/tts_finetune_fastpeech2_g2pw/images/backward_time.png ADDED
exp/tts_finetune_fastpeech2_g2pw/images/duration_loss.png ADDED
exp/tts_finetune_fastpeech2_g2pw/images/energy_loss.png ADDED
exp/tts_finetune_fastpeech2_g2pw/images/forward_time.png ADDED
exp/tts_finetune_fastpeech2_g2pw/images/gpu_max_cached_mem_GB.png ADDED
exp/tts_finetune_fastpeech2_g2pw/images/iter_time.png ADDED
exp/tts_finetune_fastpeech2_g2pw/images/l1_loss.png ADDED
exp/tts_finetune_fastpeech2_g2pw/images/loss.png ADDED
exp/tts_finetune_fastpeech2_g2pw/images/optim0_lr0.png ADDED
exp/tts_finetune_fastpeech2_g2pw/images/optim_step_time.png ADDED
exp/tts_finetune_fastpeech2_g2pw/images/pitch_loss.png ADDED
exp/tts_finetune_fastpeech2_g2pw/images/train_time.png ADDED
exp/tts_finetune_fastpeech2_g2pw/train.loss.ave_5best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0279cc5af6bfcb1c0538153292fd24d55ee42683e0301a25c5dd947b6df23486
3
+ size 285366631
exp/tts_finetune_tacotron2_raw_phn_chatbot_own_model/stats_phn/train/energy_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74a78953f9a18a32473fe4c496d0237235dc1c906398ad35cdc5b1d9055641af
3
+ size 770
exp/tts_finetune_tacotron2_raw_phn_chatbot_own_model/stats_phn/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9e111fcbdd087d57371949cdedfc907d5ab3570be551488517f33854b5a1fdd
3
+ size 1402
exp/tts_finetune_tacotron2_raw_phn_chatbot_own_model/stats_phn/train/pitch_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d94366f1978601ed2740155cda43f65a9b15dc12745336e92a694d23b87db752
3
+ size 770
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202211'
2
+ files:
3
+ model_file: exp/tts_finetune_fastpeech2_g2pw/train.loss.ave_5best.pth
4
+ python: "3.8.15 (default, Nov 24 2022, 15:19:38) \n[GCC 11.2.0]"
5
+ timestamp: 1674624833.444729
6
+ torch: 1.10.1
7
+ yaml_files:
8
+ train_config: exp/tts_finetune_fastpeech2_g2pw/config.yaml