GunnarThor committed on
Commit
df7cc2b
1 Parent(s): 3c45ba2

Update model

Browse files
README.md CHANGED
@@ -1,3 +1,338 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: is
7
+ datasets:
8
+ - talromur
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 TTS model
13
+
14
+ ### `GunnarThor/talromur_f_tacotron2`
15
+
16
+ This model was trained by Gunnar Thor using the talromur recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ ```bash
21
+ cd espnet
22
+ git checkout 81522029063e42ce807d9d145b64d3f9aca45987
23
+ pip install -e .
24
+ cd egs2/talromur/tts1
25
+ ./run.sh --skip_data_prep false --skip_train true --download_model GunnarThor/talromur_f_tacotron2
26
+ ```
27
+
28
+
29
+
30
+ ## TTS config
31
+
32
+ <details><summary>expand</summary>
33
+
34
+ ```
35
+ config: ./conf/tuning/train_tacotron2.yaml
36
+ print_config: false
37
+ log_level: INFO
38
+ dry_run: false
39
+ iterator_type: sequence
40
+ output_dir: exp_f/tts_train_tacotron2_raw_phn_none
41
+ ngpu: 1
42
+ seed: 0
43
+ num_workers: 1
44
+ num_att_plot: 3
45
+ dist_backend: nccl
46
+ dist_init_method: env://
47
+ dist_world_size: 2
48
+ dist_rank: 0
49
+ local_rank: 0
50
+ dist_master_addr: localhost
51
+ dist_master_port: 55005
52
+ dist_launcher: null
53
+ multiprocessing_distributed: true
54
+ unused_parameters: false
55
+ sharded_ddp: false
56
+ cudnn_enabled: true
57
+ cudnn_benchmark: false
58
+ cudnn_deterministic: true
59
+ collect_stats: false
60
+ write_collected_feats: false
61
+ max_epoch: 200
62
+ patience: null
63
+ val_scheduler_criterion:
64
+ - valid
65
+ - loss
66
+ early_stopping_criterion:
67
+ - valid
68
+ - loss
69
+ - min
70
+ best_model_criterion:
71
+ - - valid
72
+ - loss
73
+ - min
74
+ - - train
75
+ - loss
76
+ - min
77
+ keep_nbest_models: 5
78
+ nbest_averaging_interval: 0
79
+ grad_clip: 1.0
80
+ grad_clip_type: 2.0
81
+ grad_noise: false
82
+ accum_grad: 1
83
+ no_forward_run: false
84
+ resume: true
85
+ train_dtype: float32
86
+ use_amp: false
87
+ log_interval: null
88
+ use_tensorboard: true
89
+ use_wandb: false
90
+ wandb_project: null
91
+ wandb_id: null
92
+ wandb_entity: null
93
+ wandb_name: null
94
+ wandb_model_log_interval: -1
95
+ detect_anomaly: false
96
+ pretrain_path: null
97
+ init_param: []
98
+ ignore_init_mismatch: false
99
+ freeze_param: []
100
+ num_iters_per_epoch: 500
101
+ batch_size: 20
102
+ valid_batch_size: null
103
+ batch_bins: 5120000
104
+ valid_batch_bins: null
105
+ train_shape_file:
106
+ - exp_f/tts_stats_raw_phn_none/train/text_shape.phn
107
+ - exp_f/tts_stats_raw_phn_none/train/speech_shape
108
+ valid_shape_file:
109
+ - exp_f/tts_stats_raw_phn_none/valid/text_shape.phn
110
+ - exp_f/tts_stats_raw_phn_none/valid/speech_shape
111
+ batch_type: numel
112
+ valid_batch_type: null
113
+ fold_length:
114
+ - 150
115
+ - 204800
116
+ sort_in_batch: descending
117
+ sort_batch: descending
118
+ multiple_iterator: false
119
+ chunk_length: 500
120
+ chunk_shift_ratio: 0.5
121
+ num_cache_chunks: 1024
122
+ train_data_path_and_name_and_type:
123
+ - - dump/raw/train_f_phn/text
124
+ - text
125
+ - text
126
+ - - dump/raw/train_f_phn/wav.scp
127
+ - speech
128
+ - sound
129
+ valid_data_path_and_name_and_type:
130
+ - - dump/raw/dev_f_phn/text
131
+ - text
132
+ - text
133
+ - - dump/raw/dev_f_phn/wav.scp
134
+ - speech
135
+ - sound
136
+ allow_variable_data_keys: false
137
+ max_cache_size: 0.0
138
+ max_cache_fd: 32
139
+ valid_max_cache_size: null
140
+ optim: adam
141
+ optim_conf:
142
+ lr: 0.001
143
+ eps: 1.0e-06
144
+ weight_decay: 0.0
145
+ scheduler: null
146
+ scheduler_conf: {}
147
+ token_list:
148
+ - <blank>
149
+ - <unk>
150
+ - ','
151
+ - .
152
+ - r
153
+ - t
154
+ - n
155
+ - a0
156
+ - s
157
+ - I0
158
+ - D
159
+ - l
160
+ - Y0
161
+ - m
162
+ - v
163
+ - h
164
+ - k
165
+ - E1
166
+ - a:1
167
+ - E:1
168
+ - f
169
+ - G
170
+ - j
171
+ - a1
172
+ - T
173
+ - p
174
+ - c
175
+ - au:1
176
+ - E0
177
+ - i:1
178
+ - O:1
179
+ - I:1
180
+ - I1
181
+ - r_0
182
+ - t_h
183
+ - k_h
184
+ - Y1
185
+ - ei1
186
+ - i0
187
+ - ei:1
188
+ - ou:1
189
+ - u:1
190
+ - O1
191
+ - N
192
+ - l_0
193
+ - '91'
194
+ - ai0
195
+ - au1
196
+ - ou0
197
+ - ai:1
198
+ - n_0
199
+ - ei0
200
+ - O0
201
+ - ou1
202
+ - i1
203
+ - '9:1'
204
+ - ai1
205
+ - '90'
206
+ - au0
207
+ - x
208
+ - c_h
209
+ - 9i:1
210
+ - C
211
+ - p_h
212
+ - u0
213
+ - Y:1
214
+ - J
215
+ - 9i1
216
+ - u1
217
+ - 9i0
218
+ - N_0
219
+ - m_0
220
+ - J_0
221
+ - Yi0
222
+ - Oi1
223
+ - Yi1
224
+ - Oi0
225
+ - au:0
226
+ - '9:0'
227
+ - E:0
228
+ - <sos/eos>
229
+ odim: null
230
+ model_conf: {}
231
+ use_preprocessor: true
232
+ token_type: phn
233
+ bpemodel: null
234
+ non_linguistic_symbols: null
235
+ cleaner: null
236
+ g2p: null
237
+ feats_extract: fbank
238
+ feats_extract_conf:
239
+ n_fft: 1024
240
+ hop_length: 256
241
+ win_length: null
242
+ fs: 22050
243
+ fmin: 80
244
+ fmax: 7600
245
+ n_mels: 80
246
+ normalize: global_mvn
247
+ normalize_conf:
248
+ stats_file: exp_f/tts_stats_raw_phn_none/train/feats_stats.npz
249
+ tts: tacotron2
250
+ tts_conf:
251
+ embed_dim: 512
252
+ elayers: 1
253
+ eunits: 512
254
+ econv_layers: 3
255
+ econv_chans: 512
256
+ econv_filts: 5
257
+ atype: location
258
+ adim: 512
259
+ aconv_chans: 32
260
+ aconv_filts: 15
261
+ cumulate_att_w: true
262
+ dlayers: 2
263
+ dunits: 1024
264
+ prenet_layers: 2
265
+ prenet_units: 256
266
+ postnet_layers: 5
267
+ postnet_chans: 512
268
+ postnet_filts: 5
269
+ output_activation: null
270
+ use_batch_norm: true
271
+ use_concate: true
272
+ use_residual: false
273
+ dropout_rate: 0.5
274
+ zoneout_rate: 0.1
275
+ reduction_factor: 1
276
+ spk_embed_dim: null
277
+ use_masking: true
278
+ bce_pos_weight: 5.0
279
+ use_guided_attn_loss: true
280
+ guided_attn_loss_sigma: 0.4
281
+ guided_attn_loss_lambda: 1.0
282
+ pitch_extract: null
283
+ pitch_extract_conf: {}
284
+ pitch_normalize: null
285
+ pitch_normalize_conf: {}
286
+ energy_extract: null
287
+ energy_extract_conf: {}
288
+ energy_normalize: null
289
+ energy_normalize_conf: {}
290
+ required:
291
+ - output_dir
292
+ - token_list
293
+ version: 0.10.5a1
294
+ distributed: true
295
+ ```
296
+
297
+ </details>
298
+
299
+
300
+
301
+ ### Citing ESPnet
302
+
303
+ ```bibtex
304
+ @inproceedings{watanabe2018espnet,
305
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
306
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
307
+ year={2018},
308
+ booktitle={Proceedings of Interspeech},
309
+ pages={2207--2211},
310
+ doi={10.21437/Interspeech.2018-1456},
311
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
312
+ }
313
+
314
+
315
+
316
+
317
+ @inproceedings{hayashi2020espnet,
318
+ title={{Espnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
319
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
320
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
321
+ pages={7654--7658},
322
+ year={2020},
323
+ organization={IEEE}
324
+ }
325
+ ```
326
+
327
+ or arXiv:
328
+
329
+ ```bibtex
330
+ @misc{watanabe2018espnet,
331
+ title={ESPnet: End-to-End Speech Processing Toolkit},
332
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
333
+ year={2018},
334
+ eprint={1804.00015},
335
+ archivePrefix={arXiv},
336
+ primaryClass={cs.CL}
337
+ }
338
+ ```
exp_f/tts_stats_raw_phn_none/train/feats_stats.npz ADDED
Binary file (1.4 kB). View file
 
exp_f/tts_train_tacotron2_raw_phn_none/config.yaml ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: ./conf/tuning/train_tacotron2.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp_f/tts_train_tacotron2_raw_phn_none
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 2
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 55005
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 200
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss
39
+ - min
40
+ - - train
41
+ - loss
42
+ - min
43
+ keep_nbest_models: 5
44
+ nbest_averaging_interval: 0
45
+ grad_clip: 1.0
46
+ grad_clip_type: 2.0
47
+ grad_noise: false
48
+ accum_grad: 1
49
+ no_forward_run: false
50
+ resume: true
51
+ train_dtype: float32
52
+ use_amp: false
53
+ log_interval: null
54
+ use_tensorboard: true
55
+ use_wandb: false
56
+ wandb_project: null
57
+ wandb_id: null
58
+ wandb_entity: null
59
+ wandb_name: null
60
+ wandb_model_log_interval: -1
61
+ detect_anomaly: false
62
+ pretrain_path: null
63
+ init_param: []
64
+ ignore_init_mismatch: false
65
+ freeze_param: []
66
+ num_iters_per_epoch: 500
67
+ batch_size: 20
68
+ valid_batch_size: null
69
+ batch_bins: 5120000
70
+ valid_batch_bins: null
71
+ train_shape_file:
72
+ - exp_f/tts_stats_raw_phn_none/train/text_shape.phn
73
+ - exp_f/tts_stats_raw_phn_none/train/speech_shape
74
+ valid_shape_file:
75
+ - exp_f/tts_stats_raw_phn_none/valid/text_shape.phn
76
+ - exp_f/tts_stats_raw_phn_none/valid/speech_shape
77
+ batch_type: numel
78
+ valid_batch_type: null
79
+ fold_length:
80
+ - 150
81
+ - 204800
82
+ sort_in_batch: descending
83
+ sort_batch: descending
84
+ multiple_iterator: false
85
+ chunk_length: 500
86
+ chunk_shift_ratio: 0.5
87
+ num_cache_chunks: 1024
88
+ train_data_path_and_name_and_type:
89
+ - - dump/raw/train_f_phn/text
90
+ - text
91
+ - text
92
+ - - dump/raw/train_f_phn/wav.scp
93
+ - speech
94
+ - sound
95
+ valid_data_path_and_name_and_type:
96
+ - - dump/raw/dev_f_phn/text
97
+ - text
98
+ - text
99
+ - - dump/raw/dev_f_phn/wav.scp
100
+ - speech
101
+ - sound
102
+ allow_variable_data_keys: false
103
+ max_cache_size: 0.0
104
+ max_cache_fd: 32
105
+ valid_max_cache_size: null
106
+ optim: adam
107
+ optim_conf:
108
+ lr: 0.001
109
+ eps: 1.0e-06
110
+ weight_decay: 0.0
111
+ scheduler: null
112
+ scheduler_conf: {}
113
+ token_list:
114
+ - <blank>
115
+ - <unk>
116
+ - ','
117
+ - .
118
+ - r
119
+ - t
120
+ - n
121
+ - a0
122
+ - s
123
+ - I0
124
+ - D
125
+ - l
126
+ - Y0
127
+ - m
128
+ - v
129
+ - h
130
+ - k
131
+ - E1
132
+ - a:1
133
+ - E:1
134
+ - f
135
+ - G
136
+ - j
137
+ - a1
138
+ - T
139
+ - p
140
+ - c
141
+ - au:1
142
+ - E0
143
+ - i:1
144
+ - O:1
145
+ - I:1
146
+ - I1
147
+ - r_0
148
+ - t_h
149
+ - k_h
150
+ - Y1
151
+ - ei1
152
+ - i0
153
+ - ei:1
154
+ - ou:1
155
+ - u:1
156
+ - O1
157
+ - N
158
+ - l_0
159
+ - '91'
160
+ - ai0
161
+ - au1
162
+ - ou0
163
+ - ai:1
164
+ - n_0
165
+ - ei0
166
+ - O0
167
+ - ou1
168
+ - i1
169
+ - '9:1'
170
+ - ai1
171
+ - '90'
172
+ - au0
173
+ - x
174
+ - c_h
175
+ - 9i:1
176
+ - C
177
+ - p_h
178
+ - u0
179
+ - Y:1
180
+ - J
181
+ - 9i1
182
+ - u1
183
+ - 9i0
184
+ - N_0
185
+ - m_0
186
+ - J_0
187
+ - Yi0
188
+ - Oi1
189
+ - Yi1
190
+ - Oi0
191
+ - au:0
192
+ - '9:0'
193
+ - E:0
194
+ - <sos/eos>
195
+ odim: null
196
+ model_conf: {}
197
+ use_preprocessor: true
198
+ token_type: phn
199
+ bpemodel: null
200
+ non_linguistic_symbols: null
201
+ cleaner: null
202
+ g2p: null
203
+ feats_extract: fbank
204
+ feats_extract_conf:
205
+ n_fft: 1024
206
+ hop_length: 256
207
+ win_length: null
208
+ fs: 22050
209
+ fmin: 80
210
+ fmax: 7600
211
+ n_mels: 80
212
+ normalize: global_mvn
213
+ normalize_conf:
214
+ stats_file: exp_f/tts_stats_raw_phn_none/train/feats_stats.npz
215
+ tts: tacotron2
216
+ tts_conf:
217
+ embed_dim: 512
218
+ elayers: 1
219
+ eunits: 512
220
+ econv_layers: 3
221
+ econv_chans: 512
222
+ econv_filts: 5
223
+ atype: location
224
+ adim: 512
225
+ aconv_chans: 32
226
+ aconv_filts: 15
227
+ cumulate_att_w: true
228
+ dlayers: 2
229
+ dunits: 1024
230
+ prenet_layers: 2
231
+ prenet_units: 256
232
+ postnet_layers: 5
233
+ postnet_chans: 512
234
+ postnet_filts: 5
235
+ output_activation: null
236
+ use_batch_norm: true
237
+ use_concate: true
238
+ use_residual: false
239
+ dropout_rate: 0.5
240
+ zoneout_rate: 0.1
241
+ reduction_factor: 1
242
+ spk_embed_dim: null
243
+ use_masking: true
244
+ bce_pos_weight: 5.0
245
+ use_guided_attn_loss: true
246
+ guided_attn_loss_sigma: 0.4
247
+ guided_attn_loss_lambda: 1.0
248
+ pitch_extract: null
249
+ pitch_extract_conf: {}
250
+ pitch_normalize: null
251
+ pitch_normalize_conf: {}
252
+ energy_extract: null
253
+ energy_extract_conf: {}
254
+ energy_normalize: null
255
+ energy_normalize_conf: {}
256
+ required:
257
+ - output_dir
258
+ - token_list
259
+ version: 0.10.5a1
260
+ distributed: true
exp_f/tts_train_tacotron2_raw_phn_none/images/attn_loss.png ADDED
exp_f/tts_train_tacotron2_raw_phn_none/images/backward_time.png ADDED
exp_f/tts_train_tacotron2_raw_phn_none/images/bce_loss.png ADDED
exp_f/tts_train_tacotron2_raw_phn_none/images/forward_time.png ADDED
exp_f/tts_train_tacotron2_raw_phn_none/images/gpu_max_cached_mem_GB.png ADDED
exp_f/tts_train_tacotron2_raw_phn_none/images/iter_time.png ADDED
exp_f/tts_train_tacotron2_raw_phn_none/images/l1_loss.png ADDED
exp_f/tts_train_tacotron2_raw_phn_none/images/loss.png ADDED
exp_f/tts_train_tacotron2_raw_phn_none/images/mse_loss.png ADDED
exp_f/tts_train_tacotron2_raw_phn_none/images/optim0_lr0.png ADDED
exp_f/tts_train_tacotron2_raw_phn_none/images/optim_step_time.png ADDED
exp_f/tts_train_tacotron2_raw_phn_none/images/train_time.png ADDED
exp_f/tts_train_tacotron2_raw_phn_none/train.loss.ave_5best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a24fd3e7ddce4b77f437b014daf7865a37ba4f1d4ca761ecb9b330940342b3d
3
+ size 106863980
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: 0.10.5a1
2
+ files:
3
+ model_file: exp_f/tts_train_tacotron2_raw_phn_none/train.loss.ave_5best.pth
4
+ python: "3.8.12 (default, Oct 12 2021, 13:49:34) \n[GCC 7.5.0]"
5
+ timestamp: 1643827657.249845
6
+ torch: 1.10.0
7
+ yaml_files:
8
+ train_config: exp_f/tts_train_tacotron2_raw_phn_none/config.yaml