RayeRen committed on
Commit
d5e28e1
2 Parent(s): 4c1df9e 15e73a1

Merge branch 'main' into ps

Browse files
checkpoints/ps_normal_exp/config.yaml CHANGED
@@ -82,7 +82,6 @@ fvae_kernel_size: 5
82
  fvae_noise_scale: 1.0
83
  fvae_strides: 4
84
  gen_dir_name: ''
85
- glow_kernel_size: 3
86
  griffin_lim_iters: 30
87
  hidden_size: 192
88
  hop_size: 256
@@ -127,8 +126,6 @@ out_wav_norm: false
127
  pitch_extractor: parselmouth
128
  pitch_key: pitch
129
  pitch_type: frame
130
- post_decoder: false
131
- post_decoder_detach_ling: false
132
  post_flow_lr: 0.001
133
  post_glow_hidden: 192
134
  post_glow_kernel_size: 3
@@ -157,8 +154,9 @@ preprocess_args:
157
  with_phsep: true
158
  preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
159
  print_nan_grads: false
160
- prior_glow_hidden: 64
161
- prior_glow_n_blocks: 4
 
162
  processed_data_dir: data/processed/ljspeech
163
  profile_infer: false
164
  raw_data_dir: data/raw/LJSpeech-1.1
 
82
  fvae_noise_scale: 1.0
83
  fvae_strides: 4
84
  gen_dir_name: ''
 
85
  griffin_lim_iters: 30
86
  hidden_size: 192
87
  hop_size: 256
 
126
  pitch_extractor: parselmouth
127
  pitch_key: pitch
128
  pitch_type: frame
 
 
129
  post_flow_lr: 0.001
130
  post_glow_hidden: 192
131
  post_glow_kernel_size: 3
 
154
  with_phsep: true
155
  preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
156
  print_nan_grads: false
157
+ prior_flow_hidden: 64
158
+ prior_flow_kernel_size: 3
159
+ prior_flow_n_blocks: 4
160
  processed_data_dir: data/processed/ljspeech
161
  profile_infer: false
162
  raw_data_dir: data/raw/LJSpeech-1.1
checkpoints/ps_small_exp/config.yaml CHANGED
@@ -82,7 +82,6 @@ fvae_kernel_size: 3
82
  fvae_noise_scale: 1.0
83
  fvae_strides: 4
84
  gen_dir_name: ''
85
- glow_kernel_size: 3
86
  griffin_lim_iters: 30
87
  hidden_size: 128
88
  hop_size: 256
@@ -127,8 +126,6 @@ out_wav_norm: false
127
  pitch_extractor: parselmouth
128
  pitch_key: pitch
129
  pitch_type: frame
130
- post_decoder: false
131
- post_decoder_detach_ling: false
132
  post_flow_lr: 0.001
133
  post_glow_hidden: 128
134
  post_glow_kernel_size: 3
@@ -157,8 +154,9 @@ preprocess_args:
157
  with_phsep: true
158
  preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
159
  print_nan_grads: false
160
- prior_glow_hidden: 32
161
- prior_glow_n_blocks: 3
 
162
  processed_data_dir: data/processed/ljspeech
163
  profile_infer: false
164
  raw_data_dir: data/raw/LJSpeech-1.1
 
82
  fvae_noise_scale: 1.0
83
  fvae_strides: 4
84
  gen_dir_name: ''
 
85
  griffin_lim_iters: 30
86
  hidden_size: 128
87
  hop_size: 256
 
126
  pitch_extractor: parselmouth
127
  pitch_key: pitch
128
  pitch_type: frame
 
 
129
  post_flow_lr: 0.001
130
  post_glow_hidden: 128
131
  post_glow_kernel_size: 3
 
154
  with_phsep: true
155
  preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
156
  print_nan_grads: false
157
+ prior_flow_hidden: 32
158
+ prior_flow_kernel_size: 3
159
+ prior_flow_n_blocks: 3
160
  processed_data_dir: data/processed/ljspeech
161
  profile_infer: false
162
  raw_data_dir: data/raw/LJSpeech-1.1
egs/egs_bases/tts/ps.yaml CHANGED
@@ -38,14 +38,12 @@ fvae_enc_n_layers: 8
38
  fvae_dec_n_layers: 4
39
  fvae_strides: 4
40
  fvae_noise_scale: 1.0
41
- post_decoder: false
42
- post_decoder_detach_ling: false
43
 
44
  # prior flow
45
  use_prior_flow: true
46
- prior_glow_hidden: 64
47
- glow_kernel_size: 3
48
- prior_glow_n_blocks: 4
49
 
50
  ###########################
51
  # training and inference
 
38
  fvae_dec_n_layers: 4
39
  fvae_strides: 4
40
  fvae_noise_scale: 1.0
 
 
41
 
42
  # prior flow
43
  use_prior_flow: true
44
+ prior_flow_hidden: 64
45
+ prior_flow_kernel_size: 3
46
+ prior_flow_n_blocks: 4
47
 
48
  ###########################
49
  # training and inference
egs/egs_bases/tts/ps_flow_small.yaml CHANGED
@@ -30,9 +30,9 @@ fvae_noise_scale: 1.0
30
 
31
  # prior flow
32
  use_prior_flow: true
33
- prior_glow_hidden: 32
34
- glow_kernel_size: 3
35
- prior_glow_n_blocks: 3
36
  # post flow
37
  post_glow_hidden: 128
38
  post_glow_kernel_size: 3
 
30
 
31
  # prior flow
32
  use_prior_flow: true
33
+ prior_flow_hidden: 32
34
+ prior_flow_kernel_size: 3
35
+ prior_flow_n_blocks: 3
36
  # post flow
37
  post_glow_hidden: 128
38
  post_glow_kernel_size: 3
modules/tts/portaspeech/portaspeech.py CHANGED
@@ -74,9 +74,9 @@ class PortaSpeech(FastSpeech):
74
  dec_n_layers=hparams['fvae_dec_n_layers'],
75
  c_cond=self.hidden_size,
76
  use_prior_flow=hparams['use_prior_flow'],
77
- flow_hidden=hparams['prior_glow_hidden'],
78
- flow_kernel_size=hparams['glow_kernel_size'],
79
- flow_n_steps=hparams['prior_glow_n_blocks'],
80
  strides=[hparams['fvae_strides']],
81
  encoder_type=hparams['fvae_encoder_type'],
82
  decoder_type=hparams['fvae_decoder_type'],
@@ -88,11 +88,6 @@ class PortaSpeech(FastSpeech):
88
  self.pitch_embed = Embedding(300, self.hidden_size, 0)
89
  if self.hparams['add_word_pos']:
90
  self.word_pos_proj = Linear(self.hidden_size, self.hidden_size)
91
- if self.hparams['post_decoder']:
92
- self.post_decoder_proj_in = Linear(self.out_dims, self.hidden_size)
93
- self.post_decoder = ConditionalConvBlocks(
94
- self.hidden_size, self.hidden_size, self.out_dims, None,
95
- hparams['dec_kernel_size'], num_layers=4)
96
 
97
  def build_embedding(self, dictionary, embed_dim):
98
  num_embeddings = len(dictionary)
@@ -188,11 +183,6 @@ class PortaSpeech(FastSpeech):
188
  z = torch.randn_like(z)
189
  x_recon = self.fvae.decoder(z, nonpadding=tgt_nonpadding_BHT, cond=x).transpose(1, 2)
190
  ret['pre_mel_out'] = x_recon
191
- if self.hparams['post_decoder']:
192
- x_recon = self.post_decoder_proj_in(x_recon.detach())
193
- if self.hparams['post_decoder_detach_ling']:
194
- decoder_inp = decoder_inp.detach()
195
- x_recon = self.post_decoder(x_recon, decoder_inp) * tgt_nonpadding
196
  return x_recon
197
 
198
  def forward_dur(self, dur_input, mel2word, ret, **kwargs):
 
74
  dec_n_layers=hparams['fvae_dec_n_layers'],
75
  c_cond=self.hidden_size,
76
  use_prior_flow=hparams['use_prior_flow'],
77
+ flow_hidden=hparams['prior_flow_hidden'],
78
+ flow_kernel_size=hparams['prior_flow_kernel_size'],
79
+ flow_n_steps=hparams['prior_flow_n_blocks'],
80
  strides=[hparams['fvae_strides']],
81
  encoder_type=hparams['fvae_encoder_type'],
82
  decoder_type=hparams['fvae_decoder_type'],
 
88
  self.pitch_embed = Embedding(300, self.hidden_size, 0)
89
  if self.hparams['add_word_pos']:
90
  self.word_pos_proj = Linear(self.hidden_size, self.hidden_size)
 
 
 
 
 
91
 
92
  def build_embedding(self, dictionary, embed_dim):
93
  num_embeddings = len(dictionary)
 
183
  z = torch.randn_like(z)
184
  x_recon = self.fvae.decoder(z, nonpadding=tgt_nonpadding_BHT, cond=x).transpose(1, 2)
185
  ret['pre_mel_out'] = x_recon
 
 
 
 
 
186
  return x_recon
187
 
188
  def forward_dur(self, dur_input, mel2word, ret, **kwargs):
tasks/tts/ps.py CHANGED
@@ -58,8 +58,6 @@ class PortaSpeechTask(FastSpeechTask):
58
  losses_kl = min(self.global_step / hparams['kl_start_steps'], 1) * losses_kl
59
  losses_kl = losses_kl * hparams['lambda_kl']
60
  losses['kl'] = losses_kl
61
- if hparams['post_decoder']:
62
- self.add_mel_loss(output['pre_mel_out'], sample['mels'], losses, '_post')
63
  self.add_mel_loss(output['mel_out'], sample['mels'], losses)
64
  if hparams['dur_level'] == 'word':
65
  self.add_dur_loss(
 
58
  losses_kl = min(self.global_step / hparams['kl_start_steps'], 1) * losses_kl
59
  losses_kl = losses_kl * hparams['lambda_kl']
60
  losses['kl'] = losses_kl
 
 
61
  self.add_mel_loss(output['mel_out'], sample['mels'], losses)
62
  if hparams['dur_level'] == 'word':
63
  self.add_dur_loss(