Yurii Paniv committed on
Commit
316ae6b
1 Parent(s): dc248e1

Release 6.0.0 model

Browse files
Files changed (7) hide show
  1. .gitignore +1 -0
  2. README.md +10 -7
  3. app.py +6 -17
  4. config.yaml +139 -125
  5. requirements.txt +1 -1
  6. setup.py +2 -2
  7. ukrainian_tts/tts.py +7 -10
.gitignore CHANGED
@@ -135,6 +135,7 @@ dmypy.json
135
  *.pth.tar
136
  *.pth
137
  *.ark
 
138
 
139
  # gradio
140
  gradio_queue.db
 
135
  *.pth.tar
136
  *.pth
137
  *.ark
138
+ *.npz
139
 
140
  # gradio
141
  gradio_queue.db
README.md CHANGED
@@ -38,27 +38,30 @@ If you like my work, please support ❤️ -> [https://send.monobank.ua/jar/48iH
38
  You're welcome to join UA Speech Recognition and Synthesis community: [Telegram https://t.me/speech_recognition_uk](https://t.me/speech_recognition_uk)
39
  # Examples 🤖
40
 
41
- `Tetiana (female)`:
42
 
43
- https://user-images.githubusercontent.com/5759207/224504324-d8236cad-7302-4dfd-9696-7a42b9f05fce.mp4
44
 
45
 
46
  <details>
47
  <summary>More voices 📢🤖</summary>
48
 
49
- `Dmytro (male)`:
50
 
51
- https://user-images.githubusercontent.com/5759207/224504354-f84f74d3-fa46-497c-9604-4b63ba45989f.mp4
52
 
 
53
 
54
- `Lada (female)`:
55
 
56
- https://user-images.githubusercontent.com/5759207/224504360-ec198ac2-647c-4238-99ef-b6f074d633fd.mp4
57
 
 
58
 
59
  `Mykyta (male)`:
60
 
61
- https://user-images.githubusercontent.com/5759207/224504363-0227e8bf-8c1c-49ad-8602-8cbf8feaa82b.mp4
 
62
 
63
  </details>
64
 
 
38
  You're welcome to join UA Speech Recognition and Synthesis community: [Telegram https://t.me/speech_recognition_uk](https://t.me/speech_recognition_uk)
39
  # Examples 🤖
40
 
41
+ `Oleksa (male)`:
42
 
43
+ https://github.com/robinhad/ukrainian-tts/assets/5759207/ace842ef-06d0-4b1f-ad49-5fda92999dbb
44
 
45
 
46
  <details>
47
  <summary>More voices 📢🤖</summary>
48
 
49
+ `Tetiana (female)`:
50
 
51
+ https://github.com/robinhad/ukrainian-tts/assets/5759207/a6ecacf6-62ae-4fc5-b6d5-41e6cdd3d992
52
 
53
+ `Dmytro (male)`:
54
 
55
+ https://github.com/robinhad/ukrainian-tts/assets/5759207/67d3dac9-6626-40ef-98e5-ec194096bbe0
56
 
57
+ `Lada (female)`:
58
 
59
+ https://github.com/robinhad/ukrainian-tts/assets/5759207/fcf558b2-3ff9-4539-ad9e-8455b52223a4
60
 
61
  `Mykyta (male)`:
62
 
63
+ https://github.com/robinhad/ukrainian-tts/assets/5759207/033f5215-3f09-4021-ba19-1f55158445ca
64
+
65
 
66
  </details>
67
 
app.py CHANGED
@@ -43,6 +43,7 @@ class VoiceOption(Enum):
43
  Mykyta = "Микита (чоловічий) 👨"
44
  Lada = "Лада (жіночий) 👩"
45
  Dmytro = "Дмитро (чоловічий) 👨"
 
46
 
47
 
48
  print(f"CUDA available? {is_available()}")
@@ -51,7 +52,7 @@ print(f"CUDA available? {is_available()}")
51
  ukr_tts = TTS(device="cuda" if is_available() else "cpu")
52
 
53
 
54
- def tts(text: str, voice: str, speed: float):
55
  print("============================")
56
  print("Original text:", text)
57
  print("Voice", voice)
@@ -62,6 +63,7 @@ def tts(text: str, voice: str, speed: float):
62
  VoiceOption.Mykyta.value: Voices.Mykyta.value,
63
  VoiceOption.Lada.value: Voices.Lada.value,
64
  VoiceOption.Dmytro.value: Voices.Dmytro.value,
 
65
  }
66
 
67
  speaker_name = voice_mapping[voice]
@@ -72,11 +74,11 @@ def tts(text: str, voice: str, speed: float):
72
 
73
  if getenv("HF_API_TOKEN") is not None:
74
  log_queue.put(
75
- [text, speaker_name, Stress.Dictionary.value, speed, str(datetime.utcnow())]
76
  )
77
 
78
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
79
- _, text = ukr_tts.tts(text, speaker_name, Stress.Dictionary.value, fp, speed)
80
  return fp.name, text
81
 
82
 
@@ -97,9 +99,6 @@ iface = gr.Interface(
97
  choices=[option.value for option in VoiceOption],
98
  value=VoiceOption.Tetiana.value,
99
  ),
100
- gr.components.Slider(
101
- label="Швидкість", minimum=0.5, maximum=2, value=1, step=0.05
102
- ),
103
  ],
104
  outputs=[
105
  gr.components.Audio(label="Output"),
@@ -112,32 +111,22 @@ iface = gr.Interface(
112
  [
113
  "Привіт, як тебе звати?",
114
  VoiceOption.Tetiana.value,
115
- 1,
116
  ],
117
  [
118
  "Введіть, будь ласка, св+оє реч+ення.",
119
  VoiceOption.Dmytro.value,
120
- 1,
121
- ],
122
- [
123
- "Введіть, будь ласка, своє речення.",
124
- VoiceOption.Dmytro.value,
125
- 1.3,
126
  ],
127
  [
128
  "Введіть, будь ласка, своє речення.",
129
- VoiceOption.Mykyta.value,
130
- 1,
131
  ],
132
  [
133
  "Введіть, будь ласка, своє речення.",
134
  VoiceOption.Mykyta.value,
135
- 0.7,
136
  ],
137
  [
138
  "Договір підписано 4 квітня 1949 року.",
139
  VoiceOption.Lada.value,
140
- 0.9,
141
  ],
142
  ],
143
  )
 
43
  Mykyta = "Микита (чоловічий) 👨"
44
  Lada = "Лада (жіночий) 👩"
45
  Dmytro = "Дмитро (чоловічий) 👨"
46
+ Oleksa = "Олекса (чоловічий) 👨"
47
 
48
 
49
  print(f"CUDA available? {is_available()}")
 
52
  ukr_tts = TTS(device="cuda" if is_available() else "cpu")
53
 
54
 
55
+ def tts(text: str, voice: str):
56
  print("============================")
57
  print("Original text:", text)
58
  print("Voice", voice)
 
63
  VoiceOption.Mykyta.value: Voices.Mykyta.value,
64
  VoiceOption.Lada.value: Voices.Lada.value,
65
  VoiceOption.Dmytro.value: Voices.Dmytro.value,
66
+ VoiceOption.Oleksa.value: Voices.Oleksa.value,
67
  }
68
 
69
  speaker_name = voice_mapping[voice]
 
74
 
75
  if getenv("HF_API_TOKEN") is not None:
76
  log_queue.put(
77
+ [text, speaker_name, Stress.Dictionary.value, 1, str(datetime.utcnow())]
78
  )
79
 
80
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
81
+ _, text = ukr_tts.tts(text, speaker_name, Stress.Dictionary.value, fp)
82
  return fp.name, text
83
 
84
 
 
99
  choices=[option.value for option in VoiceOption],
100
  value=VoiceOption.Tetiana.value,
101
  ),
 
 
 
102
  ],
103
  outputs=[
104
  gr.components.Audio(label="Output"),
 
111
  [
112
  "Привіт, як тебе звати?",
113
  VoiceOption.Tetiana.value,
 
114
  ],
115
  [
116
  "Введіть, будь ласка, св+оє реч+ення.",
117
  VoiceOption.Dmytro.value,
 
 
 
 
 
 
118
  ],
119
  [
120
  "Введіть, будь ласка, своє речення.",
121
+ VoiceOption.Oleksa.value,
 
122
  ],
123
  [
124
  "Введіть, будь ласка, своє речення.",
125
  VoiceOption.Mykyta.value,
 
126
  ],
127
  [
128
  "Договір підписано 4 квітня 1949 року.",
129
  VoiceOption.Lada.value,
 
130
  ],
131
  ],
132
  )
config.yaml CHANGED
@@ -1,11 +1,11 @@
1
- config: ./conf/tuning/train_vits.yaml
2
  print_config: false
3
  log_level: INFO
4
  dry_run: false
5
  iterator_type: sequence
6
- output_dir: exp/22k/tts_train_vits_raw_char
7
  ngpu: 1
8
- seed: 3407
9
  num_workers: 4
10
  num_att_plot: 3
11
  dist_backend: nccl
@@ -24,7 +24,7 @@ cudnn_benchmark: false
24
  cudnn_deterministic: false
25
  collect_stats: false
26
  write_collected_feats: false
27
- max_epoch: 1000
28
  patience: null
29
  val_scheduler_criterion:
30
  - valid
@@ -34,10 +34,16 @@ early_stopping_criterion:
34
  - loss
35
  - min
36
  best_model_criterion:
 
 
 
 
 
 
37
  - - train
38
  - total_count
39
  - max
40
- keep_nbest_models: 10
41
  nbest_averaging_interval: 0
42
  grad_clip: -1
43
  grad_clip_type: 2.0
@@ -59,20 +65,23 @@ wandb_name: null
59
  wandb_model_log_interval: -1
60
  detect_anomaly: false
61
  pretrain_path: null
62
- init_param: []
 
 
 
63
  ignore_init_mismatch: false
64
  freeze_param: []
65
  num_iters_per_epoch: null
66
  batch_size: 20
67
  valid_batch_size: null
68
- batch_bins: 1500000
69
  valid_batch_bins: null
70
  train_shape_file:
71
- - exp/22k/tts_stats_raw_linear_spectrogram_char/train/text_shape.char
72
- - exp/22k/tts_stats_raw_linear_spectrogram_char/train/speech_shape
73
  valid_shape_file:
74
- - exp/22k/tts_stats_raw_linear_spectrogram_char/valid/text_shape.char
75
- - exp/22k/tts_stats_raw_linear_spectrogram_char/valid/speech_shape
76
  batch_type: numel
77
  valid_batch_type: null
78
  fold_length:
@@ -110,29 +119,27 @@ max_cache_fd: 32
110
  valid_max_cache_size: null
111
  exclude_weight_decay: false
112
  exclude_weight_decay_conf: {}
113
- optim: adamw
114
  optim_conf:
115
- lr: 0.0002
116
  betas:
117
- - 0.8
118
- - 0.99
119
- eps: 1.0e-09
120
  weight_decay: 0.0
121
  scheduler: exponentiallr
122
  scheduler_conf:
123
  gamma: 0.999875
124
- optim2: adamw
125
  optim2_conf:
126
- lr: 0.0002
127
  betas:
128
- - 0.8
129
- - 0.99
130
- eps: 1.0e-09
131
  weight_decay: 0.0
132
  scheduler2: exponentiallr
133
  scheduler2_conf:
134
  gamma: 0.999875
135
- generator_first: false
136
  token_list:
137
  - <blank>
138
  - <unk>
@@ -154,14 +161,13 @@ token_list:
154
  - к
155
  - м
156
  - п
157
- - .
158
  - я
159
  - з
160
  - ','
161
  - б
162
  - ь
163
- - ч
164
  - г
 
165
  - й
166
  - ж
167
  - х
@@ -176,13 +182,12 @@ token_list:
176
  - '!'
177
  - ''''
178
  - ф
 
179
  - '"'
180
- - ':'
181
  - ґ
182
- - (
183
- - )
184
- - „
185
  - /
 
186
  - <sos/eos>
187
  odim: null
188
  model_conf: {}
@@ -192,54 +197,67 @@ bpemodel: null
192
  non_linguistic_symbols: null
193
  cleaner: null
194
  g2p: g2p_en
195
- feats_extract: linear_spectrogram
196
  feats_extract_conf:
197
  n_fft: 1024
198
  hop_length: 256
199
  win_length: null
200
- normalize: null
201
- normalize_conf: {}
202
- tts: vits
 
 
 
 
 
203
  tts_conf:
204
- generator_type: vits_generator
205
- generator_params:
206
- hidden_channels: 192
207
- spks: -1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  spk_embed_dim: 192
209
- global_channels: 256
210
- segment_size: 32
211
- text_encoder_attention_heads: 2
212
- text_encoder_ffn_expand: 4
213
- text_encoder_blocks: 6
214
- text_encoder_positionwise_layer_type: conv1d
215
- text_encoder_positionwise_conv_kernel_size: 3
216
- text_encoder_positional_encoding_layer_type: rel_pos
217
- text_encoder_self_attention_layer_type: rel_selfattn
218
- text_encoder_activation_type: swish
219
- text_encoder_normalize_before: true
220
- text_encoder_dropout_rate: 0.1
221
- text_encoder_positional_dropout_rate: 0.0
222
- text_encoder_attention_dropout_rate: 0.1
223
- use_macaron_style_in_text_encoder: true
224
- use_conformer_conv_in_text_encoder: false
225
- text_encoder_conformer_kernel_size: -1
226
- decoder_kernel_size: 7
227
- decoder_channels: 512
228
- decoder_upsample_scales:
229
- - 8
230
- - 8
231
- - 2
232
- - 2
233
- decoder_upsample_kernel_sizes:
234
- - 16
235
- - 16
236
- - 4
237
- - 4
238
- decoder_resblock_kernel_sizes:
239
- - 3
240
- - 7
241
- - 11
242
- decoder_resblock_dilations:
243
  - - 1
244
  - 3
245
  - 5
@@ -249,94 +267,90 @@ tts_conf:
249
  - - 1
250
  - 3
251
  - 5
252
- use_weight_norm_in_decoder: true
253
- posterior_encoder_kernel_size: 5
254
- posterior_encoder_layers: 16
255
- posterior_encoder_stacks: 1
256
- posterior_encoder_base_dilation: 1
257
- posterior_encoder_dropout_rate: 0.0
258
- use_weight_norm_in_posterior_encoder: true
259
- flow_flows: 4
260
- flow_kernel_size: 5
261
- flow_base_dilation: 1
262
- flow_layers: 4
263
- flow_dropout_rate: 0.0
264
- use_weight_norm_in_flow: true
265
- use_only_mean_in_flow: true
266
- stochastic_duration_predictor_kernel_size: 3
267
- stochastic_duration_predictor_dropout_rate: 0.5
268
- stochastic_duration_predictor_flows: 4
269
- stochastic_duration_predictor_dds_conv_layers: 3
270
- vocabs: 50
271
- aux_channels: 513
272
  discriminator_type: hifigan_multi_scale_multi_period_discriminator
273
  discriminator_params:
274
- scales: 1
275
- scale_downsample_pooling: AvgPool1d
276
- scale_downsample_pooling_params:
277
- kernel_size: 4
278
- stride: 2
279
- padding: 2
280
- scale_discriminator_params:
 
 
 
281
  in_channels: 1
282
- out_channels: 1
283
  kernel_sizes:
284
- - 15
285
- - 41
286
  - 5
287
  - 3
288
- channels: 128
289
  max_downsample_channels: 1024
290
- max_groups: 16
291
- bias: true
292
- downsample_scales:
293
- - 2
294
- - 2
295
- - 4
296
- - 4
297
- - 1
298
  nonlinear_activation: LeakyReLU
299
  nonlinear_activation_params:
300
  negative_slope: 0.1
301
- use_weight_norm: true
302
  use_spectral_norm: false
303
- follow_official_norm: false
304
  periods:
305
  - 2
306
  - 3
307
  - 5
308
  - 7
309
  - 11
310
- period_discriminator_params:
 
 
 
 
 
 
 
 
311
  in_channels: 1
312
- out_channels: 1
313
  kernel_sizes:
 
 
314
  - 5
315
  - 3
316
- channels: 32
317
- downsample_scales:
318
- - 3
319
- - 3
320
- - 3
321
- - 3
322
- - 1
323
  max_downsample_channels: 1024
324
- bias: true
325
  nonlinear_activation: LeakyReLU
326
  nonlinear_activation_params:
327
  negative_slope: 0.1
328
- use_weight_norm: true
329
- use_spectral_norm: false
 
 
 
 
 
330
  generator_adv_loss_params:
331
  average_by_discriminators: false
332
  loss_type: mse
333
  discriminator_adv_loss_params:
334
  average_by_discriminators: false
335
  loss_type: mse
 
336
  feat_match_loss_params:
337
  average_by_discriminators: false
338
  average_by_layers: false
339
  include_final_outputs: true
 
340
  mel_loss_params:
341
  fs: 22050
342
  n_fft: 1024
@@ -347,12 +361,12 @@ tts_conf:
347
  fmin: 0
348
  fmax: null
349
  log_base: null
 
350
  lambda_adv: 1.0
351
  lambda_mel: 45.0
352
  lambda_feat_match: 2.0
353
- lambda_dur: 1.0
354
- lambda_kl: 1.0
355
  sampling_rate: 22050
 
356
  cache_generator_outputs: true
357
  pitch_extract: null
358
  pitch_extract_conf: {}
 
1
+ config: ./conf/tuning/finetune_joint_tacotron2_hifigan.yaml
2
  print_config: false
3
  log_level: INFO
4
  dry_run: false
5
  iterator_type: sequence
6
+ output_dir: exp/22k/tts_finetune_joint_tacotron2_hifigan_raw_char
7
  ngpu: 1
8
+ seed: 777
9
  num_workers: 4
10
  num_att_plot: 3
11
  dist_backend: nccl
 
24
  cudnn_deterministic: false
25
  collect_stats: false
26
  write_collected_feats: false
27
+ max_epoch: 140
28
  patience: null
29
  val_scheduler_criterion:
30
  - valid
 
34
  - loss
35
  - min
36
  best_model_criterion:
37
+ - - valid
38
+ - text2mel_loss
39
+ - min
40
+ - - train
41
+ - text2mel_loss
42
+ - min
43
  - - train
44
  - total_count
45
  - max
46
+ keep_nbest_models: 5
47
  nbest_averaging_interval: 0
48
  grad_clip: -1
49
  grad_clip_type: 2.0
 
65
  wandb_model_log_interval: -1
66
  detect_anomaly: false
67
  pretrain_path: null
68
+ init_param:
69
+ - exp/22k/tts_train_tacotron2_raw_char/train.loss.ave_5best.pth:tts:tts.generator.text2mel
70
+ - exp/22k/ljspeech_hifigan.v1/generator.pth::tts.generator.vocoder
71
+ - exp/22k/ljspeech_hifigan.v1/discriminator.pth::tts.discriminator
72
  ignore_init_mismatch: false
73
  freeze_param: []
74
  num_iters_per_epoch: null
75
  batch_size: 20
76
  valid_batch_size: null
77
+ batch_bins: 1600000
78
  valid_batch_bins: null
79
  train_shape_file:
80
+ - exp/22k/tts_stats_raw_char/train/text_shape.char
81
+ - exp/22k/tts_stats_raw_char/train/speech_shape
82
  valid_shape_file:
83
+ - exp/22k/tts_stats_raw_char/valid/text_shape.char
84
+ - exp/22k/tts_stats_raw_char/valid/speech_shape
85
  batch_type: numel
86
  valid_batch_type: null
87
  fold_length:
 
119
  valid_max_cache_size: null
120
  exclude_weight_decay: false
121
  exclude_weight_decay_conf: {}
122
+ optim: adam
123
  optim_conf:
124
+ lr: 1.25e-05
125
  betas:
126
+ - 0.5
127
+ - 0.9
 
128
  weight_decay: 0.0
129
  scheduler: exponentiallr
130
  scheduler_conf:
131
  gamma: 0.999875
132
+ optim2: adam
133
  optim2_conf:
134
+ lr: 1.25e-05
135
  betas:
136
+ - 0.5
137
+ - 0.9
 
138
  weight_decay: 0.0
139
  scheduler2: exponentiallr
140
  scheduler2_conf:
141
  gamma: 0.999875
142
+ generator_first: true
143
  token_list:
144
  - <blank>
145
  - <unk>
 
161
  - к
162
  - м
163
  - п
 
164
  - я
165
  - з
166
  - ','
167
  - б
168
  - ь
 
169
  - г
170
+ - ч
171
  - й
172
  - ж
173
  - х
 
182
  - '!'
183
  - ''''
184
  - ф
185
+ - .
186
  - '"'
 
187
  - ґ
188
+ - ':'
 
 
189
  - /
190
+ - „
191
  - <sos/eos>
192
  odim: null
193
  model_conf: {}
 
197
  non_linguistic_symbols: null
198
  cleaner: null
199
  g2p: g2p_en
200
+ feats_extract: fbank
201
  feats_extract_conf:
202
  n_fft: 1024
203
  hop_length: 256
204
  win_length: null
205
+ fs: 22050
206
+ fmin: 80
207
+ fmax: 7600
208
+ n_mels: 80
209
+ normalize: global_mvn
210
+ normalize_conf:
211
+ stats_file: feats_stats.npz
212
+ tts: joint_text2wav
213
  tts_conf:
214
+ text2mel_type: tacotron2
215
+ text2mel_params:
216
+ embed_dim: 512
217
+ elayers: 1
218
+ eunits: 512
219
+ econv_layers: 3
220
+ econv_chans: 512
221
+ econv_filts: 5
222
+ atype: location
223
+ adim: 512
224
+ aconv_chans: 32
225
+ aconv_filts: 15
226
+ cumulate_att_w: true
227
+ dlayers: 2
228
+ dunits: 1024
229
+ prenet_layers: 2
230
+ prenet_units: 256
231
+ postnet_layers: 5
232
+ postnet_chans: 512
233
+ postnet_filts: 5
234
+ output_activation: null
235
+ use_batch_norm: true
236
+ use_concate: true
237
+ use_residual: false
238
  spk_embed_dim: 192
239
+ spk_embed_integration_type: add
240
+ dropout_rate: 0.5
241
+ zoneout_rate: 0.1
242
+ reduction_factor: 1
243
+ use_masking: true
244
+ bce_pos_weight: 10.0
245
+ use_guided_attn_loss: true
246
+ guided_attn_loss_sigma: 0.4
247
+ guided_attn_loss_lambda: 1.0
248
+ idim: 48
249
+ odim: 80
250
+ vocoder_type: hifigan_generator
251
+ vocoder_params:
252
+ bias: true
253
+ channels: 512
254
+ in_channels: 80
255
+ kernel_size: 7
256
+ nonlinear_activation: LeakyReLU
257
+ nonlinear_activation_params:
258
+ negative_slope: 0.1
259
+ out_channels: 1
260
+ resblock_dilations:
 
 
 
 
 
 
 
 
 
 
 
 
261
  - - 1
262
  - 3
263
  - 5
 
267
  - - 1
268
  - 3
269
  - 5
270
+ resblock_kernel_sizes:
271
+ - 3
272
+ - 7
273
+ - 11
274
+ upsample_kernel_sizes:
275
+ - 16
276
+ - 16
277
+ - 4
278
+ - 4
279
+ upsample_scales:
280
+ - 8
281
+ - 8
282
+ - 2
283
+ - 2
284
+ use_additional_convs: true
285
+ use_weight_norm: true
 
 
 
 
286
  discriminator_type: hifigan_multi_scale_multi_period_discriminator
287
  discriminator_params:
288
+ follow_official_norm: true
289
+ period_discriminator_params:
290
+ bias: true
291
+ channels: 32
292
+ downsample_scales:
293
+ - 3
294
+ - 3
295
+ - 3
296
+ - 3
297
+ - 1
298
  in_channels: 1
 
299
  kernel_sizes:
 
 
300
  - 5
301
  - 3
 
302
  max_downsample_channels: 1024
 
 
 
 
 
 
 
 
303
  nonlinear_activation: LeakyReLU
304
  nonlinear_activation_params:
305
  negative_slope: 0.1
306
+ out_channels: 1
307
  use_spectral_norm: false
308
+ use_weight_norm: true
309
  periods:
310
  - 2
311
  - 3
312
  - 5
313
  - 7
314
  - 11
315
+ scale_discriminator_params:
316
+ bias: true
317
+ channels: 128
318
+ downsample_scales:
319
+ - 4
320
+ - 4
321
+ - 4
322
+ - 4
323
+ - 1
324
  in_channels: 1
 
325
  kernel_sizes:
326
+ - 15
327
+ - 41
328
  - 5
329
  - 3
 
 
 
 
 
 
 
330
  max_downsample_channels: 1024
331
+ max_groups: 16
332
  nonlinear_activation: LeakyReLU
333
  nonlinear_activation_params:
334
  negative_slope: 0.1
335
+ out_channels: 1
336
+ scale_downsample_pooling: AvgPool1d
337
+ scale_downsample_pooling_params:
338
+ kernel_size: 4
339
+ padding: 2
340
+ stride: 2
341
+ scales: 3
342
  generator_adv_loss_params:
343
  average_by_discriminators: false
344
  loss_type: mse
345
  discriminator_adv_loss_params:
346
  average_by_discriminators: false
347
  loss_type: mse
348
+ use_feat_match_loss: true
349
  feat_match_loss_params:
350
  average_by_discriminators: false
351
  average_by_layers: false
352
  include_final_outputs: true
353
+ use_mel_loss: true
354
  mel_loss_params:
355
  fs: 22050
356
  n_fft: 1024
 
361
  fmin: 0
362
  fmax: null
363
  log_base: null
364
+ lambda_text2mel: 1.0
365
  lambda_adv: 1.0
366
  lambda_mel: 45.0
367
  lambda_feat_match: 2.0
 
 
368
  sampling_rate: 22050
369
+ segment_size: 32
370
  cache_generator_outputs: true
371
  pitch_extract: null
372
  pitch_extract_conf: {}
requirements.txt CHANGED
@@ -1,6 +1,6 @@
1
  # requirements for HuggingFace demo. Installs local package.
2
  torch
3
- espnet>=202301
4
  typeguard<3 # typeguard 3.0.0 is incompatible with espnet
5
  git+https://github.com/savoirfairelinux/num2words.git@3e39091d052829fc9e65c18176ce7b7ff6169772
6
  ukrainian-word-stress==1.0.2
 
1
  # requirements for HuggingFace demo. Installs local package.
2
  torch
3
+ espnet==202301
4
  typeguard<3 # typeguard 3.0.0 is incompatible with espnet
5
  git+https://github.com/savoirfairelinux/num2words.git@3e39091d052829fc9e65c18176ce7b7ff6169772
6
  ukrainian-word-stress==1.0.2
setup.py CHANGED
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
3
 
4
  setup(
5
  name="ukrainian-tts",
6
- version="5.0",
7
  description="Ukrainian TTS using ESPNET",
8
  author="Yurii Paniv",
9
  author_email="[email protected]",
@@ -12,7 +12,7 @@ setup(
12
  packages=find_packages(),
13
  python_requires=">3.6.0",
14
  install_requires=[
15
- "espnet>=202301",
16
  "typeguard<3",
17
  "num2words @ git+https://github.com/savoirfairelinux/num2words.git@3e39091d052829fc9e65c18176ce7b7ff6169772",
18
  "ukrainian-word-stress==1.0.2",
 
3
 
4
  setup(
5
  name="ukrainian-tts",
6
+ version="6.0",
7
  description="Ukrainian TTS using ESPNET",
8
  author="Yurii Paniv",
9
  author_email="[email protected]",
 
12
  packages=find_packages(),
13
  python_requires=">3.6.0",
14
  install_requires=[
15
+ "espnet==202301",
16
  "typeguard<3",
17
  "num2words @ git+https://github.com/savoirfairelinux/num2words.git@3e39091d052829fc9e65c18176ce7b7ff6169772",
18
  "ukrainian-word-stress==1.0.2",
ukrainian_tts/tts.py CHANGED
@@ -19,6 +19,7 @@ class Voices(Enum):
19
  Mykyta = "mykyta"
20
  Lada = "lada"
21
  Dmytro = "dmytro"
 
22
 
23
 
24
  class Stress(Enum):
@@ -41,7 +42,7 @@ class TTS:
41
  self.device = device
42
  self.__setup_cache(cache_folder)
43
 
44
- def tts(self, text: str, voice: str, stress: str, output_fp=BytesIO(), speed=1.0):
45
  """
46
  Run a Text-to-Speech engine and output to `output_fp` BytesIO-like object.
47
  - `text` - your model input text.
@@ -71,9 +72,7 @@ class TTS:
71
  # synthesis
72
  with no_grad():
73
  start = time.time()
74
- wav = self.synthesizer(
75
- text, spembs=self.xvectors[voice][0], decode_conf={"alpha": 1 / speed}
76
- )["wav"]
77
 
78
  rtf = (time.time() - start) / (len(wav) / self.synthesizer.fs)
79
  print(f"RTF = {rtf:5f}")
@@ -99,6 +98,7 @@ class TTS:
99
  model_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/model.pth"
100
  config_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/config.yaml"
101
  speakers_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/spk_xvector.ark"
 
102
 
103
  if cache_folder is None:
104
  cache_folder = "."
@@ -106,19 +106,16 @@ class TTS:
106
  model_path = join(cache_folder, "model.pth")
107
  config_path = join(cache_folder, "config.yaml")
108
  speakers_path = join(cache_folder, "spk_xvector.ark")
 
109
 
110
  self.__download(model_link, model_path)
111
  self.__download(config_link, config_path)
112
  self.__download(speakers_link, speakers_path)
 
113
  print("downloaded.")
114
 
115
  self.synthesizer = Text2Speech(
116
- train_config=config_path,
117
- model_file=model_path,
118
- device=self.device,
119
- # Only for VITS
120
- noise_scale=0.333,
121
- noise_scale_dur=0.333,
122
  )
123
  self.xvectors = {k: v for k, v in load_ark(speakers_path)}
124
 
 
19
  Mykyta = "mykyta"
20
  Lada = "lada"
21
  Dmytro = "dmytro"
22
+ Oleksa = "oleksa"
23
 
24
 
25
  class Stress(Enum):
 
42
  self.device = device
43
  self.__setup_cache(cache_folder)
44
 
45
+ def tts(self, text: str, voice: str, stress: str, output_fp=BytesIO()):
46
  """
47
  Run a Text-to-Speech engine and output to `output_fp` BytesIO-like object.
48
  - `text` - your model input text.
 
72
  # synthesis
73
  with no_grad():
74
  start = time.time()
75
+ wav = self.synthesizer(text, spembs=self.xvectors[voice][0])["wav"]
 
 
76
 
77
  rtf = (time.time() - start) / (len(wav) / self.synthesizer.fs)
78
  print(f"RTF = {rtf:5f}")
 
98
  model_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/model.pth"
99
  config_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/config.yaml"
100
  speakers_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/spk_xvector.ark"
101
+ feat_stats_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/feat_stats.npz"
102
 
103
  if cache_folder is None:
104
  cache_folder = "."
 
106
  model_path = join(cache_folder, "model.pth")
107
  config_path = join(cache_folder, "config.yaml")
108
  speakers_path = join(cache_folder, "spk_xvector.ark")
109
+ feat_stats_path = join(cache_folder, "feats_stats.npz")
110
 
111
  self.__download(model_link, model_path)
112
  self.__download(config_link, config_path)
113
  self.__download(speakers_link, speakers_path)
114
+ self.__download(feat_stats_link, feat_stats_path)
115
  print("downloaded.")
116
 
117
  self.synthesizer = Text2Speech(
118
+ train_config=config_path, model_file=model_path, device=self.device
 
 
 
 
 
119
  )
120
  self.xvectors = {k: v for k, v in load_ark(speakers_path)}
121