Nick256
/

tts-tacotron2-commonvoice-single-female

@@ -1,126 +1,14 @@
-# Generated 2024-03-06 from:
-# /home/marconilab/tacotron2/hparams/train.yaml
-# yamllint disable
-############################################################################
-# Model: Tacotron2
-# Tokens: Raw characters (English text)
-# losses: Transducer
-# Training: LJSpeech
-# Authors: Georges Abous-Rjeili, Artem Ploujnikov, Yingzhi Wang
-# ############################################################################
-###################################
-# Experiment Parameters and setup #
-###################################
-seed: 1234
-__set_seed: !apply:torch.manual_seed [1234]
-output_folder: ./results/tacotron2/1234
-save_folder: ./results/tacotron2/1234/save
-train_log: ./results/tacotron2/1234/train_log.txt
-epochs: 500
-keep_checkpoint_interval: 50
-wandb_id: tacotron2-luganda
-wandb_user: sulaiman-kagumire
-wandb_project: tts-luganda
-init_from_pretrained: true
-###################################
-# Progress Samples                #
-###################################
-# Progress samples are used to monitor the progress
-# of an ongoing training session by outputting samples
-# of spectrograms, alignments, etc at regular intervals
-# Whether to enable progress samples
-progress_samples: false
-# The path where the samples will be stored
-progress_sample_path: ./results/tacotron2/1234/samples
-# The interval, in epochs. For instance, if it is set to 5,
-# progress samples will be output every 5 epochs
-progress_samples_interval: 1
-# The sample size for raw batch samples saved in batch.pth
-# (useful mostly for model debugging)
-progress_batch_sample_size: 3
-#################################
-# Data files and pre-processing #
-#################################
-data_folder: data_folder
-                          # e.g, /localscratch/ljspeech
-train_json: ./results/tacotron2/1234/save/train.json
-valid_json: ./results/tacotron2/1234/save/valid.json
-test_json: ./results/tacotron2/1234/save/test.json
-splits: [train, valid, test]
-split_ratio: [80, 10, 10]
-skip_prep: false
-# Use the original preprocessing from nvidia
-# The cleaners to be used (applicable to nvidia only)
-text_cleaners: [basic_cleaners]
-################################
-# Audio Parameters             #
-################################
-sample_rate: 22050
-hop_length: 256
-win_length: 1024
 n_mel_channels: 80
-n_fft: 1024
-mel_fmin: 0.0
-mel_fmax: 8000.0
-mel_normalized: false
-power: 1
-norm: slaney
-mel_scale: slaney
-dynamic_range_compression: true
-################################
-# Optimization Hyperparameters #
-################################
-learning_rate: 0.001
-weight_decay: 0.000006
-batch_size: 256
-num_workers: 8
-mask_padding: true
-guided_attention_sigma: 0.2
-guided_attention_weight: 50.0
-guided_attention_weight_half_life: 10.
-guided_attention_hard_stop: 50
-gate_loss_weight: 1.0
-train_dataloader_opts:
-  batch_size: 256
-  drop_last: false  #True #False
-  num_workers: 8
-  collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate
-valid_dataloader_opts:
-  batch_size: 256
-  num_workers: 8
-  collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate
-test_dataloader_opts:
-  batch_size: 256
-  num_workers: 8
-  collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate
-################################
-# Model Parameters and model   #
-################################
-n_symbols: 148 #fixed depending on symbols in textToSequence
 symbols_embedding_dim: 512
-# Encoder parameters
 encoder_kernel_size: 5
 encoder_n_convolutions: 3
 encoder_embedding_dim: 512
-# Decoder parameters
-# The number of frames in the target per encoder step
 n_frames_per_step: 1
 decoder_rnn_dim: 1024
 prenet_dim: 256
@@ -128,123 +16,49 @@ max_decoder_steps: 1000
 gate_threshold: 0.5
 p_attention_dropout: 0.1
 p_decoder_dropout: 0.1
-decoder_no_early_stopping: false
-# Attention parameters
-attention_rnn_dim: 1024
-attention_dim: 128
-# Location Layer parameters
-attention_location_n_filters: 32
-attention_location_kernel_size: 31
-# Mel-post processing network parameters
 postnet_embedding_dim: 512
 postnet_kernel_size: 5
 postnet_n_convolutions: 5
-mel_spectogram: !name:speechbrain.lobes.models.Tacotron2.mel_spectogram
-  sample_rate: 22050
-  hop_length: 256
-  win_length: 1024
-  n_fft: 1024
-  n_mels: 80
-  f_min: 0.0
-  f_max: 8000.0
-  power: 1
-  normalized: false
-  norm: slaney
-  mel_scale: slaney
-  compression: true
-#model
-model: &id002 !new:speechbrain.lobes.models.Tacotron2.Tacotron2
-#optimizer
-  mask_padding: true
-  n_mel_channels: 80
   # symbols
-  n_symbols: 148
-  symbols_embedding_dim: 512
   # encoder
-  encoder_kernel_size: 5
-  encoder_n_convolutions: 3
-  encoder_embedding_dim: 512
   # attention
-  attention_rnn_dim: 1024
-  attention_dim: 128
   # attention location
-  attention_location_n_filters: 32
-  attention_location_kernel_size: 31
   # decoder
-  n_frames_per_step: 1
-  decoder_rnn_dim: 1024
-  prenet_dim: 256
-  max_decoder_steps: 1000
-  gate_threshold: 0.5
-  p_attention_dropout: 0.1
-  p_decoder_dropout: 0.1
   # postnet
-  postnet_embedding_dim: 512
-  postnet_kernel_size: 5
-  postnet_n_convolutions: 5
-  decoder_no_early_stopping: false
-guided_attention_scheduler: &id001 !new:speechbrain.nnet.schedulers.StepScheduler
-  initial_value: 50.0
-  half_life: 10.
-criterion: !new:speechbrain.lobes.models.Tacotron2.Loss
-  gate_loss_weight: 1.0
-  guided_attention_weight: 50.0
-  guided_attention_sigma: 0.2
-  guided_attention_scheduler: *id001
-  guided_attention_hard_stop: 50
 modules:
-  model: *id002
-opt_class: !name:torch.optim.Adam
-  lr: 0.001
-  weight_decay: 0.000006
-#epoch object
-epoch_counter: &id003 !new:speechbrain.utils.epoch_loop.EpochCounter
-  limit: 500
-train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
-  save_file: !ref <train_log>
-# train_logger: !new:speechbrain.utils.train_logger.WandBLogger
-#   initializer: !name:wandb.init
-#     # id: !ref <wandb_id>
-#   name: tacotron2-luganda
-#   entity: sulaiman-kagumire
-#   project: tts-luganda
-#   reinit: true
-#     # yaml_config: hparams/train.yaml
-#   resume: allow
-#annealing_function
-lr_annealing: &id004 !new:speechbrain.nnet.schedulers.IntervalScheduler
-#infer: !name:speechbrain.lobes.models.Tacotron2.infer
-  intervals:
-  - steps: 6000
-    lr: 0.0005
-  - steps: 8000
-    lr: 0.0003
-  - steps: 10000
-    lr: 0.0001
-#checkpointer
-checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
-  checkpoints_dir: ./results/tacotron2/1234/save
-  recoverables:
-    model: *id002
-    counter: *id003
-    scheduler: *id004
-progress_sample_logger: !new:speechbrain.utils.train_logger.ProgressSampleLogger
-  output_path: ./results/tacotron2/1234/samples
-  batch_sample_size: 3
-  formats:
-    raw_batch: raw

+# Tacotron2 hyperparameters. Every key except sample_rate is passed to the
+# model entry further down through a !ref of the same name.
+mask_padding: true
 n_mel_channels: 80
+n_symbols: 148
 symbols_embedding_dim: 512
 encoder_kernel_size: 5
 encoder_n_convolutions: 3
 encoder_embedding_dim: 512
+attention_rnn_dim: 1024
+attention_dim: 128
+attention_location_n_filters: 32
+attention_location_kernel_size: 31
 n_frames_per_step: 1
 decoder_rnn_dim: 1024
 prenet_dim: 256
 gate_threshold: 0.5
 p_attention_dropout: 0.1
 p_decoder_dropout: 0.1
 postnet_embedding_dim: 512
 postnet_kernel_size: 5
 postnet_n_convolutions: 5
+decoder_no_early_stopping: false
+sample_rate: 22050
+# Model
+# Declares speechbrain.lobes.models.Tacotron2.Tacotron2 with the !new tag; each
+# argument is a !ref to the top-level key of the same name.
+# max_decoder_steps is not among the lines added in this diff; it only shows
+# up as the context of the second hunk header (max_decoder_steps: 1000).
+model: !new:speechbrain.lobes.models.Tacotron2.Tacotron2
+  mask_padding: !ref <mask_padding>
+  n_mel_channels: !ref <n_mel_channels>
   # symbols
+  n_symbols: !ref <n_symbols>
+  symbols_embedding_dim: !ref <symbols_embedding_dim>
   # encoder
+  encoder_kernel_size: !ref <encoder_kernel_size>
+  encoder_n_convolutions: !ref <encoder_n_convolutions>
+  encoder_embedding_dim: !ref <encoder_embedding_dim>
   # attention
+  attention_rnn_dim: !ref <attention_rnn_dim>
+  attention_dim: !ref <attention_dim>
   # attention location
+  attention_location_n_filters: !ref <attention_location_n_filters>
+  attention_location_kernel_size: !ref <attention_location_kernel_size>
   # decoder
+  n_frames_per_step: !ref <n_frames_per_step>
+  decoder_rnn_dim: !ref <decoder_rnn_dim>
+  prenet_dim: !ref <prenet_dim>
+  max_decoder_steps: !ref <max_decoder_steps>
+  gate_threshold: !ref <gate_threshold>
+  p_attention_dropout: !ref <p_attention_dropout>
+  p_decoder_dropout: !ref <p_decoder_dropout>
   # postnet
+  postnet_embedding_dim: !ref <postnet_embedding_dim>
+  postnet_kernel_size: !ref <postnet_kernel_size>
+  postnet_n_convolutions: !ref <postnet_n_convolutions>
+  decoder_no_early_stopping: !ref <decoder_no_early_stopping>
+# Function that converts the text into a sequence of valid characters.
+# Tagged !name with no arguments, so the entry refers to the function itself
+# rather than to the result of calling it (HyperPyYAML !name semantics).
+text_to_sequence: !name:speechbrain.utils.text_to_sequence.text_to_sequence
 modules:
+  # Points at the model entry through !ref.
+  model: !ref <model>
+# Pretrainer whose loadables map refers to that same model entry.
+# NOTE(review): presumably used to load pretrained weights into the model;
+# confirm against the code that consumes this file.
+pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+  loadables:
+    model: !ref <model>