ESPnet
multilingual
audio
codec
ftshijt committed on
Commit
f14b8d9
1 Parent(s): 5901f36

Update model

Browse files
Files changed (29) hide show
  1. README.md +346 -3
  2. exp_16k/codec_soundstream4_16k_speech/254epoch.pth +3 -0
  3. exp_16k/codec_soundstream4_16k_speech/config.yaml +271 -0
  4. exp_16k/codec_soundstream4_16k_speech/images/adv_loss.png +0 -0
  5. exp_16k/codec_soundstream4_16k_speech/images/codec_commit_loss.png +0 -0
  6. exp_16k/codec_soundstream4_16k_speech/images/codec_loss.png +0 -0
  7. exp_16k/codec_soundstream4_16k_speech/images/codec_quantization_loss.png +0 -0
  8. exp_16k/codec_soundstream4_16k_speech/images/discriminator_backward_time.png +0 -0
  9. exp_16k/codec_soundstream4_16k_speech/images/discriminator_forward_time.png +0 -0
  10. exp_16k/codec_soundstream4_16k_speech/images/discriminator_loss.png +0 -0
  11. exp_16k/codec_soundstream4_16k_speech/images/discriminator_optim_step_time.png +0 -0
  12. exp_16k/codec_soundstream4_16k_speech/images/discriminator_train_time.png +0 -0
  13. exp_16k/codec_soundstream4_16k_speech/images/fake_loss.png +0 -0
  14. exp_16k/codec_soundstream4_16k_speech/images/feat_match_loss.png +0 -0
  15. exp_16k/codec_soundstream4_16k_speech/images/generator_backward_time.png +0 -0
  16. exp_16k/codec_soundstream4_16k_speech/images/generator_forward_time.png +0 -0
  17. exp_16k/codec_soundstream4_16k_speech/images/generator_optim_step_time.png +0 -0
  18. exp_16k/codec_soundstream4_16k_speech/images/generator_train_time.png +0 -0
  19. exp_16k/codec_soundstream4_16k_speech/images/gpu_max_cached_mem_GB.png +0 -0
  20. exp_16k/codec_soundstream4_16k_speech/images/iter_time.png +0 -0
  21. exp_16k/codec_soundstream4_16k_speech/images/loss.png +0 -0
  22. exp_16k/codec_soundstream4_16k_speech/images/mel_loss.png +0 -0
  23. exp_16k/codec_soundstream4_16k_speech/images/mel_loss_real.png +0 -0
  24. exp_16k/codec_soundstream4_16k_speech/images/optim0_lr0.png +0 -0
  25. exp_16k/codec_soundstream4_16k_speech/images/optim1_lr0.png +0 -0
  26. exp_16k/codec_soundstream4_16k_speech/images/real_loss.png +0 -0
  27. exp_16k/codec_soundstream4_16k_speech/images/reconstruct_loss.png +0 -0
  28. exp_16k/codec_soundstream4_16k_speech/images/train_time.png +0 -0
  29. meta.yaml +8 -0
README.md CHANGED
@@ -1,3 +1,346 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - codec
6
+ language: multilingual
7
+ datasets:
8
+ - amuse
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 Codec model
13
+
14
+ ### `espnet/amuse_speech_soundstream_16k`
15
+
16
+ This model was trained by ftshijt using the amuse recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout 5201685018b0e8fb9826bc51a710623140a06627
26
+ pip install -e .
27
+ cd egs2/amuse/codec1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model espnet/amuse_speech_soundstream_16k
29
+ ```
30
+
31
+
32
+
33
+ ## Codec config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: conf/train_soundstream4_fs16000.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ drop_last_iter: false
42
+ dry_run: false
43
+ iterator_type: chunk
44
+ valid_iterator_type: null
45
+ output_dir: exp_16k/codec_soundstream4_16k_speech
46
+ ngpu: 1
47
+ seed: 777
48
+ num_workers: 1
49
+ num_att_plot: 0
50
+ dist_backend: nccl
51
+ dist_init_method: env://
52
+ dist_world_size: 4
53
+ dist_rank: 0
54
+ local_rank: 0
55
+ dist_master_addr: localhost
56
+ dist_master_port: 48299
57
+ dist_launcher: null
58
+ multiprocessing_distributed: true
59
+ unused_parameters: true
60
+ sharded_ddp: false
61
+ cudnn_enabled: true
62
+ cudnn_benchmark: false
63
+ cudnn_deterministic: false
64
+ use_tf32: true
65
+ collect_stats: false
66
+ write_collected_feats: false
67
+ max_epoch: 360
68
+ patience: null
69
+ val_scheduler_criterion:
70
+ - valid
71
+ - loss
72
+ early_stopping_criterion:
73
+ - valid
74
+ - loss
75
+ - min
76
+ best_model_criterion:
77
+ - - valid
78
+ - mel_loss
79
+ - min
80
+ - - train
81
+ - mel_loss
82
+ - min
83
+ - - train
84
+ - total_count
85
+ - max
86
+ keep_nbest_models: 5
87
+ nbest_averaging_interval: 0
88
+ grad_clip: -1
89
+ grad_clip_type: 2.0
90
+ grad_noise: false
91
+ accum_grad: 1
92
+ no_forward_run: false
93
+ resume: true
94
+ train_dtype: float32
95
+ use_amp: false
96
+ log_interval: 50
97
+ use_matplotlib: true
98
+ use_tensorboard: true
99
+ create_graph_in_tensorboard: false
100
+ use_wandb: false
101
+ wandb_project: null
102
+ wandb_id: null
103
+ wandb_entity: null
104
+ wandb_name: null
105
+ wandb_model_log_interval: -1
106
+ detect_anomaly: false
107
+ use_adapter: false
108
+ adapter: lora
109
+ save_strategy: all
110
+ adapter_conf: {}
111
+ pretrain_path: null
112
+ init_param: []
113
+ ignore_init_mismatch: false
114
+ freeze_param: []
115
+ num_iters_per_epoch: 5000
116
+ batch_size: 64
117
+ valid_batch_size: null
118
+ batch_bins: 1000000
119
+ valid_batch_bins: null
120
+ train_shape_file:
121
+ - exp_16k/codec_stats_speech/train/audio_shape
122
+ valid_shape_file:
123
+ - exp_16k/codec_stats_speech/valid/audio_shape
124
+ batch_type: unsorted
125
+ valid_batch_type: null
126
+ fold_length:
127
+ - 256000
128
+ sort_in_batch: descending
129
+ shuffle_within_batch: false
130
+ sort_batch: descending
131
+ multiple_iterator: false
132
+ chunk_length: 32000
133
+ chunk_shift_ratio: 0.5
134
+ num_cache_chunks: 128
135
+ chunk_excluded_key_prefixes: []
136
+ chunk_default_fs: null
137
+ train_data_path_and_name_and_type:
138
+ - - dump_16k/raw/speech_train/wav.scp
139
+ - audio
140
+ - kaldi_ark
141
+ valid_data_path_and_name_and_type:
142
+ - - dump_16k/raw/dev-small/wav.scp
143
+ - audio
144
+ - kaldi_ark
145
+ multi_task_dataset: false
146
+ allow_variable_data_keys: false
147
+ max_cache_size: 0.0
148
+ max_cache_fd: 32
149
+ allow_multi_rates: false
150
+ valid_max_cache_size: null
151
+ exclude_weight_decay: false
152
+ exclude_weight_decay_conf: {}
153
+ optim: adam
154
+ optim_conf:
155
+ lr: 0.0002
156
+ betas:
157
+ - 0.5
158
+ - 0.9
159
+ eps: 1.0e-09
160
+ weight_decay: 0.0
161
+ scheduler: exponentiallr
162
+ scheduler_conf:
163
+ gamma: 0.999875
164
+ optim2: adam
165
+ optim2_conf:
166
+ lr: 0.0002
167
+ betas:
168
+ - 0.5
169
+ - 0.9
170
+ eps: 1.0e-09
171
+ weight_decay: 0.0
172
+ scheduler2: exponentiallr
173
+ scheduler2_conf:
174
+ gamma: 0.999875
175
+ generator_first: true
176
+ skip_discriminator_prob: 0.0
177
+ model_conf: {}
178
+ use_preprocessor: true
179
+ codec: soundstream
180
+ codec_conf:
181
+ sampling_rate: 16000
182
+ generator_params:
183
+ hidden_dim: 512
184
+ encdec_channels: 1
185
+ encdec_n_filters: 32
186
+ encdec_n_residual_layers: 3
187
+ encdec_ratios:
188
+ - 8
189
+ - 5
190
+ - 4
191
+ - 2
192
+ encdec_activation: ELU
193
+ encdec_activation_params:
194
+ alpha: 1.0
195
+ encdec_norm: weight_norm
196
+ encdec_kernel_size: 7
197
+ encdec_residual_kernel_size: 7
198
+ encdec_last_kernel_size: 7
199
+ encdec_dilation_base: 2
200
+ encdec_causal: false
201
+ encdec_pad_mode: reflect
202
+ encdec_true_skip: false
203
+ encdec_compress: 2
204
+ encdec_lstm: 2
205
+ decoder_trim_right_ratio: 1.0
206
+ decoder_final_activation: null
207
+ decoder_final_activation_params: null
208
+ quantizer_n_q: 32
209
+ quantizer_bins: 1024
210
+ quantizer_decay: 0.99
211
+ quantizer_kmeans_init: true
212
+ quantizer_kmeans_iters: 50
213
+ quantizer_threshold_ema_dead_code: 2
214
+ quantizer_target_bandwidth:
215
+ - 2
216
+ - 4
217
+ - 8
218
+ - 16
219
+ - 32
220
+ sample_rate: 16000
221
+ discriminator_params:
222
+ scales: 3
223
+ scale_downsample_pooling: AvgPool1d
224
+ scale_downsample_pooling_params:
225
+ kernel_size: 4
226
+ stride: 2
227
+ padding: 2
228
+ scale_discriminator_params:
229
+ in_channels: 1
230
+ out_channels: 1
231
+ kernel_sizes:
232
+ - 15
233
+ - 41
234
+ - 5
235
+ - 3
236
+ channels: 128
237
+ max_downsample_channels: 1024
238
+ max_groups: 16
239
+ bias: true
240
+ downsample_scales:
241
+ - 2
242
+ - 2
243
+ - 4
244
+ - 4
245
+ - 1
246
+ nonlinear_activation: LeakyReLU
247
+ nonlinear_activation_params:
248
+ negative_slope: 0.1
249
+ scale_follow_official_norm: false
250
+ complexstft_discriminator_params:
251
+ in_channels: 1
252
+ channels: 32
253
+ strides:
254
+ - - 1
255
+ - 2
256
+ - - 2
257
+ - 2
258
+ - - 1
259
+ - 2
260
+ - - 2
261
+ - 2
262
+ - - 1
263
+ - 2
264
+ - - 2
265
+ - 2
266
+ chan_mults:
267
+ - 1
268
+ - 2
269
+ - 4
270
+ - 4
271
+ - 8
272
+ - 8
273
+ n_fft: 1024
274
+ hop_length: 256
275
+ win_length: 1024
276
+ stft_normalized: false
277
+ generator_adv_loss_params:
278
+ average_by_discriminators: false
279
+ loss_type: mse
280
+ discriminator_adv_loss_params:
281
+ average_by_discriminators: false
282
+ loss_type: mse
283
+ use_feat_match_loss: true
284
+ feat_match_loss_params:
285
+ average_by_discriminators: false
286
+ average_by_layers: false
287
+ include_final_outputs: true
288
+ use_mel_loss: true
289
+ mel_loss_params:
290
+ range_start: 6
291
+ range_end: 11
292
+ window: hann
293
+ n_mels: 80
294
+ fmin: 0
295
+ fmax: null
296
+ log_base: null
297
+ fs: 16000
298
+ lambda_quantization: 0.0
299
+ lambda_commit: 1.0
300
+ lambda_reconstruct: 1.0
301
+ lambda_adv: 1.0
302
+ lambda_mel: 45.0
303
+ lambda_feat_match: 2.0
304
+ cache_generator_outputs: true
305
+ required:
306
+ - output_dir
307
+ version: '202402'
308
+ distributed: true
309
+ ```
310
+
311
+ </details>
312
+
313
+
314
+
315
+ ### Citing ESPnet
316
+
317
+ ```bibtex
318
+ @inproceedings{watanabe2018espnet,
319
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
320
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
321
+ year={2018},
322
+ booktitle={Proceedings of Interspeech},
323
+ pages={2207--2211},
324
+ doi={10.21437/Interspeech.2018-1456},
325
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
326
+ }
327
+
328
+
329
+
330
+
331
+
332
+
333
+ ```
334
+
335
+ or arXiv:
336
+
337
+ ```bibtex
338
+ @misc{watanabe2018espnet,
339
+ title={ESPnet: End-to-End Speech Processing Toolkit},
340
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
341
+ year={2018},
342
+ eprint={1804.00015},
343
+ archivePrefix={arXiv},
344
+ primaryClass={cs.CL}
345
+ }
346
+ ```
exp_16k/codec_soundstream4_16k_speech/254epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c1ae71483a67bc25d29f617cb41cc4f4bb91ff8f597959c1e3f15558bb76697
3
+ size 354547787
exp_16k/codec_soundstream4_16k_speech/config.yaml ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/train_soundstream4_fs16000.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: chunk
7
+ valid_iterator_type: null
8
+ output_dir: exp_16k/codec_soundstream4_16k_speech
9
+ ngpu: 1
10
+ seed: 777
11
+ num_workers: 1
12
+ num_att_plot: 0
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: 4
16
+ dist_rank: 0
17
+ local_rank: 0
18
+ dist_master_addr: localhost
19
+ dist_master_port: 48299
20
+ dist_launcher: null
21
+ multiprocessing_distributed: true
22
+ unused_parameters: true
23
+ sharded_ddp: false
24
+ cudnn_enabled: true
25
+ cudnn_benchmark: false
26
+ cudnn_deterministic: false
27
+ use_tf32: true
28
+ collect_stats: false
29
+ write_collected_feats: false
30
+ max_epoch: 360
31
+ patience: null
32
+ val_scheduler_criterion:
33
+ - valid
34
+ - loss
35
+ early_stopping_criterion:
36
+ - valid
37
+ - loss
38
+ - min
39
+ best_model_criterion:
40
+ - - valid
41
+ - mel_loss
42
+ - min
43
+ - - train
44
+ - mel_loss
45
+ - min
46
+ - - train
47
+ - total_count
48
+ - max
49
+ keep_nbest_models: 5
50
+ nbest_averaging_interval: 0
51
+ grad_clip: -1
52
+ grad_clip_type: 2.0
53
+ grad_noise: false
54
+ accum_grad: 1
55
+ no_forward_run: false
56
+ resume: true
57
+ train_dtype: float32
58
+ use_amp: false
59
+ log_interval: 50
60
+ use_matplotlib: true
61
+ use_tensorboard: true
62
+ create_graph_in_tensorboard: false
63
+ use_wandb: false
64
+ wandb_project: null
65
+ wandb_id: null
66
+ wandb_entity: null
67
+ wandb_name: null
68
+ wandb_model_log_interval: -1
69
+ detect_anomaly: false
70
+ use_adapter: false
71
+ adapter: lora
72
+ save_strategy: all
73
+ adapter_conf: {}
74
+ pretrain_path: null
75
+ init_param: []
76
+ ignore_init_mismatch: false
77
+ freeze_param: []
78
+ num_iters_per_epoch: 5000
79
+ batch_size: 64
80
+ valid_batch_size: null
81
+ batch_bins: 1000000
82
+ valid_batch_bins: null
83
+ train_shape_file:
84
+ - exp_16k/codec_stats_speech/train/audio_shape
85
+ valid_shape_file:
86
+ - exp_16k/codec_stats_speech/valid/audio_shape
87
+ batch_type: unsorted
88
+ valid_batch_type: null
89
+ fold_length:
90
+ - 256000
91
+ sort_in_batch: descending
92
+ shuffle_within_batch: false
93
+ sort_batch: descending
94
+ multiple_iterator: false
95
+ chunk_length: 32000
96
+ chunk_shift_ratio: 0.5
97
+ num_cache_chunks: 128
98
+ chunk_excluded_key_prefixes: []
99
+ chunk_default_fs: null
100
+ train_data_path_and_name_and_type:
101
+ - - dump_16k/raw/speech_train/wav.scp
102
+ - audio
103
+ - kaldi_ark
104
+ valid_data_path_and_name_and_type:
105
+ - - dump_16k/raw/dev-small/wav.scp
106
+ - audio
107
+ - kaldi_ark
108
+ multi_task_dataset: false
109
+ allow_variable_data_keys: false
110
+ max_cache_size: 0.0
111
+ max_cache_fd: 32
112
+ allow_multi_rates: false
113
+ valid_max_cache_size: null
114
+ exclude_weight_decay: false
115
+ exclude_weight_decay_conf: {}
116
+ optim: adam
117
+ optim_conf:
118
+ lr: 0.0002
119
+ betas:
120
+ - 0.5
121
+ - 0.9
122
+ eps: 1.0e-09
123
+ weight_decay: 0.0
124
+ scheduler: exponentiallr
125
+ scheduler_conf:
126
+ gamma: 0.999875
127
+ optim2: adam
128
+ optim2_conf:
129
+ lr: 0.0002
130
+ betas:
131
+ - 0.5
132
+ - 0.9
133
+ eps: 1.0e-09
134
+ weight_decay: 0.0
135
+ scheduler2: exponentiallr
136
+ scheduler2_conf:
137
+ gamma: 0.999875
138
+ generator_first: true
139
+ skip_discriminator_prob: 0.0
140
+ model_conf: {}
141
+ use_preprocessor: true
142
+ codec: soundstream
143
+ codec_conf:
144
+ sampling_rate: 16000
145
+ generator_params:
146
+ hidden_dim: 512
147
+ encdec_channels: 1
148
+ encdec_n_filters: 32
149
+ encdec_n_residual_layers: 3
150
+ encdec_ratios:
151
+ - 8
152
+ - 5
153
+ - 4
154
+ - 2
155
+ encdec_activation: ELU
156
+ encdec_activation_params:
157
+ alpha: 1.0
158
+ encdec_norm: weight_norm
159
+ encdec_kernel_size: 7
160
+ encdec_residual_kernel_size: 7
161
+ encdec_last_kernel_size: 7
162
+ encdec_dilation_base: 2
163
+ encdec_causal: false
164
+ encdec_pad_mode: reflect
165
+ encdec_true_skip: false
166
+ encdec_compress: 2
167
+ encdec_lstm: 2
168
+ decoder_trim_right_ratio: 1.0
169
+ decoder_final_activation: null
170
+ decoder_final_activation_params: null
171
+ quantizer_n_q: 32
172
+ quantizer_bins: 1024
173
+ quantizer_decay: 0.99
174
+ quantizer_kmeans_init: true
175
+ quantizer_kmeans_iters: 50
176
+ quantizer_threshold_ema_dead_code: 2
177
+ quantizer_target_bandwidth:
178
+ - 2
179
+ - 4
180
+ - 8
181
+ - 16
182
+ - 32
183
+ sample_rate: 16000
184
+ discriminator_params:
185
+ scales: 3
186
+ scale_downsample_pooling: AvgPool1d
187
+ scale_downsample_pooling_params:
188
+ kernel_size: 4
189
+ stride: 2
190
+ padding: 2
191
+ scale_discriminator_params:
192
+ in_channels: 1
193
+ out_channels: 1
194
+ kernel_sizes:
195
+ - 15
196
+ - 41
197
+ - 5
198
+ - 3
199
+ channels: 128
200
+ max_downsample_channels: 1024
201
+ max_groups: 16
202
+ bias: true
203
+ downsample_scales:
204
+ - 2
205
+ - 2
206
+ - 4
207
+ - 4
208
+ - 1
209
+ nonlinear_activation: LeakyReLU
210
+ nonlinear_activation_params:
211
+ negative_slope: 0.1
212
+ scale_follow_official_norm: false
213
+ complexstft_discriminator_params:
214
+ in_channels: 1
215
+ channels: 32
216
+ strides:
217
+ - - 1
218
+ - 2
219
+ - - 2
220
+ - 2
221
+ - - 1
222
+ - 2
223
+ - - 2
224
+ - 2
225
+ - - 1
226
+ - 2
227
+ - - 2
228
+ - 2
229
+ chan_mults:
230
+ - 1
231
+ - 2
232
+ - 4
233
+ - 4
234
+ - 8
235
+ - 8
236
+ n_fft: 1024
237
+ hop_length: 256
238
+ win_length: 1024
239
+ stft_normalized: false
240
+ generator_adv_loss_params:
241
+ average_by_discriminators: false
242
+ loss_type: mse
243
+ discriminator_adv_loss_params:
244
+ average_by_discriminators: false
245
+ loss_type: mse
246
+ use_feat_match_loss: true
247
+ feat_match_loss_params:
248
+ average_by_discriminators: false
249
+ average_by_layers: false
250
+ include_final_outputs: true
251
+ use_mel_loss: true
252
+ mel_loss_params:
253
+ range_start: 6
254
+ range_end: 11
255
+ window: hann
256
+ n_mels: 80
257
+ fmin: 0
258
+ fmax: null
259
+ log_base: null
260
+ fs: 16000
261
+ lambda_quantization: 0.0
262
+ lambda_commit: 1.0
263
+ lambda_reconstruct: 1.0
264
+ lambda_adv: 1.0
265
+ lambda_mel: 45.0
266
+ lambda_feat_match: 2.0
267
+ cache_generator_outputs: true
268
+ required:
269
+ - output_dir
270
+ version: '202402'
271
+ distributed: true
exp_16k/codec_soundstream4_16k_speech/images/adv_loss.png ADDED
exp_16k/codec_soundstream4_16k_speech/images/codec_commit_loss.png ADDED
exp_16k/codec_soundstream4_16k_speech/images/codec_loss.png ADDED
exp_16k/codec_soundstream4_16k_speech/images/codec_quantization_loss.png ADDED
exp_16k/codec_soundstream4_16k_speech/images/discriminator_backward_time.png ADDED
exp_16k/codec_soundstream4_16k_speech/images/discriminator_forward_time.png ADDED
exp_16k/codec_soundstream4_16k_speech/images/discriminator_loss.png ADDED
exp_16k/codec_soundstream4_16k_speech/images/discriminator_optim_step_time.png ADDED
exp_16k/codec_soundstream4_16k_speech/images/discriminator_train_time.png ADDED
exp_16k/codec_soundstream4_16k_speech/images/fake_loss.png ADDED
exp_16k/codec_soundstream4_16k_speech/images/feat_match_loss.png ADDED
exp_16k/codec_soundstream4_16k_speech/images/generator_backward_time.png ADDED
exp_16k/codec_soundstream4_16k_speech/images/generator_forward_time.png ADDED
exp_16k/codec_soundstream4_16k_speech/images/generator_optim_step_time.png ADDED
exp_16k/codec_soundstream4_16k_speech/images/generator_train_time.png ADDED
exp_16k/codec_soundstream4_16k_speech/images/gpu_max_cached_mem_GB.png ADDED
exp_16k/codec_soundstream4_16k_speech/images/iter_time.png ADDED
exp_16k/codec_soundstream4_16k_speech/images/loss.png ADDED
exp_16k/codec_soundstream4_16k_speech/images/mel_loss.png ADDED
exp_16k/codec_soundstream4_16k_speech/images/mel_loss_real.png ADDED
exp_16k/codec_soundstream4_16k_speech/images/optim0_lr0.png ADDED
exp_16k/codec_soundstream4_16k_speech/images/optim1_lr0.png ADDED
exp_16k/codec_soundstream4_16k_speech/images/real_loss.png ADDED
exp_16k/codec_soundstream4_16k_speech/images/reconstruct_loss.png ADDED
exp_16k/codec_soundstream4_16k_speech/images/train_time.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202402'
2
+ files:
3
+ model_file: exp_16k/codec_soundstream4_16k_speech/254epoch.pth
4
+ python: 3.10.14 | packaged by conda-forge | (main, Mar 20 2024, 12:45:18) [GCC 12.3.0]
5
+ timestamp: 1721169926.020467
6
+ torch: 2.3.0+cu118
7
+ yaml_files:
8
+ train_config: exp_16k/codec_soundstream4_16k_speech/config.yaml