Spaces:

NATSpeech
/

DiffSpeech

Runtime error

Silentlin committed on Feb 15, 2022

Commit

493b8bb

•

1 Parent(s): b247641

fix fs config

Files changed (4) hide show

egs/egs_bases/tts/fs.yaml CHANGED Viewed

@@ -36,10 +36,10 @@ dec_post_net_kernel: 3
 # duration
 predictor_hidden: -1
-predictor_kernel: 5
-predictor_layers: 2
 dur_predictor_kernel: 3
 dur_predictor_layers: 2
 predictor_dropout: 0.5
 # pitch and energy

 # duration
 predictor_hidden: -1
 dur_predictor_kernel: 3
 dur_predictor_layers: 2
+predictor_kernel: 5
+predictor_layers: 5
 predictor_dropout: 0.5
 # pitch and energy

egs/egs_bases/tts/fs2_orig.yaml CHANGED Viewed

@@ -2,10 +2,12 @@ base_config: ./fs.yaml
 task_cls: tasks.tts.fs2_orig.FastSpeech2OrigTask
 encoder_type: fft
 decoder_type: fft
-use_energy_embed: true
 use_pitch_embed: true
 pitch_type: cwt # frame|ph|cwt
 binarization_args:
   with_f0cwt: true
 use_gt_energy: false
-cwt_std_scale: 1.0

 task_cls: tasks.tts.fs2_orig.FastSpeech2OrigTask
 encoder_type: fft
 decoder_type: fft
+use_energy_embed: false
 use_pitch_embed: true
 pitch_type: cwt # frame|ph|cwt
 binarization_args:
   with_f0cwt: true
 use_gt_energy: false
+cwt_std_scale: 0.8
+dropout: 0.1
+mel_losses: l1

modules/tts/fs2_orig.py CHANGED Viewed

@@ -17,12 +17,12 @@ class FastSpeech2Orig(FastSpeech):
             self.energy_embed = Embedding(300, self.hidden_size, 0)
             self.energy_predictor = EnergyPredictor(
                 self.hidden_size, n_chans=predictor_hidden,
-                n_layers=5, dropout_rate=0.1, odim=2,
                 kernel_size=hparams['predictor_kernel'])
         if hparams['pitch_type'] == 'cwt' and hparams['use_pitch_embed']:
             self.pitch_predictor = PitchPredictor(
                 self.hidden_size, n_chans=predictor_hidden,
-                n_layers=5, dropout_rate=0.1, odim=11,
                 kernel_size=hparams['predictor_kernel'])
             self.cwt_stats_layers = nn.Sequential(
                 nn.Linear(self.hidden_size, self.hidden_size), nn.ReLU(),
@@ -67,7 +67,7 @@ class FastSpeech2Orig(FastSpeech):
             decoder_inp = decoder_inp.detach() + self.hparams['predictor_grad'] * (decoder_inp - decoder_inp.detach())
             pitch_padding = mel2ph == 0
             ret['cwt'] = cwt_out = self.pitch_predictor(decoder_inp)
-            stats_out = self.cwt_stats_layers(encoder_out[:, 0, :])  # [B, 2]
             mean = ret['f0_mean'] = stats_out[:, 0]
             std = ret['f0_std'] = stats_out[:, 1]
             cwt_spec = cwt_out[:, :, :10]

             self.energy_embed = Embedding(300, self.hidden_size, 0)
             self.energy_predictor = EnergyPredictor(
                 self.hidden_size, n_chans=predictor_hidden,
+                n_layers=hparams['predictor_layers'], dropout_rate=hparams['predictor_dropout'], odim=2,
                 kernel_size=hparams['predictor_kernel'])
         if hparams['pitch_type'] == 'cwt' and hparams['use_pitch_embed']:
             self.pitch_predictor = PitchPredictor(
                 self.hidden_size, n_chans=predictor_hidden,
+                n_layers=hparams['predictor_layers'], dropout_rate=hparams['predictor_dropout'], odim=11,
                 kernel_size=hparams['predictor_kernel'])
             self.cwt_stats_layers = nn.Sequential(
                 nn.Linear(self.hidden_size, self.hidden_size), nn.ReLU(),
             decoder_inp = decoder_inp.detach() + self.hparams['predictor_grad'] * (decoder_inp - decoder_inp.detach())
             pitch_padding = mel2ph == 0
             ret['cwt'] = cwt_out = self.pitch_predictor(decoder_inp)
+            stats_out = self.cwt_stats_layers(decoder_inp.mean(1))  # [B, 2]
             mean = ret['f0_mean'] = stats_out[:, 0]
             std = ret['f0_std'] = stats_out[:, 1]
             cwt_spec = cwt_out[:, :, :10]

tasks/tts/fs2_orig.py CHANGED Viewed

@@ -5,6 +5,8 @@ from tasks.tts.dataset_utils import FastSpeechDataset
 from tasks.tts.fs import FastSpeechTask
 from utils.commons.dataset_utils import collate_1d, collate_2d
 from utils.commons.hparams import hparams
 class FastSpeech2OrigDataset(FastSpeechDataset):
@@ -88,6 +90,24 @@ class FastSpeech2OrigTask(FastSpeechTask):
                                 f0=f0, uv=uv, energy=energy, infer=True)
             return output
     def add_pitch_loss(self, output, sample, losses):
         if hparams['pitch_type'] == 'cwt':
             cwt_spec = sample[f'cwt_spec']

 from tasks.tts.fs import FastSpeechTask
 from utils.commons.dataset_utils import collate_1d, collate_2d
 from utils.commons.hparams import hparams
+from utils.plot.plot import spec_to_figure
+import numpy as np
 class FastSpeech2OrigDataset(FastSpeechDataset):
                                 f0=f0, uv=uv, energy=energy, infer=True)
             return output
+    def save_valid_result(self, sample, batch_idx, model_out):
+        super(FastSpeech2OrigTask, self).save_valid_result(sample, batch_idx, model_out)
+        self.plot_cwt(batch_idx, model_out['cwt'], sample['cwt_spec'])
+    def plot_cwt(self, batch_idx, cwt_out, cwt_gt=None):
+        if len(cwt_out.shape) == 3:
+            cwt_out = cwt_out[0]
+        if isinstance(cwt_out, torch.Tensor):
+            cwt_out = cwt_out.cpu().numpy()
+        if cwt_gt is not None:
+            if len(cwt_gt.shape) == 3:
+                cwt_gt = cwt_gt[0]
+            if isinstance(cwt_gt, torch.Tensor):
+                cwt_gt = cwt_gt.cpu().numpy()
+            cwt_out = np.concatenate([cwt_out, cwt_gt], -1)
+        name = f'cwt_val_{batch_idx}'
+        self.logger.add_figure(name, spec_to_figure(cwt_out), self.global_step)
     def add_pitch_loss(self, output, sample, losses):
         if hparams['pitch_type'] == 'cwt':
             cwt_spec = sample[f'cwt_spec']