Edresson committed
Commit a2877d4 • 1 Parent(s): a1d8f54
Files changed (30)
  1. Experiments/nohup.out +2 -2
  2. Experiments/run/events.out.tfevents.1706462806.edresson-train-80.145564.0 +2 -2
  3. Experiments/{runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/trainer_0_log.txt → run/events.out.tfevents.1706899297.edresson-train-80-3.1052.0} +2 -2
  4. Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9 → YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/best_model.pth +2 -2
  5. Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/best_model_85001.pth → YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a/best_model_195001.pth} +2 -2
  6. Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05 → YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/config.json +6 -6
  7. Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05 → YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/language_ids.json +0 -0
  8. Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05 → YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/speakers.pth +0 -0
  9. Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116 → YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/train_syntacc_baseline.py +3 -1
  10. Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9 → YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/trainer_0_log.txt +2 -2
  11. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/best_model.pth +0 -3
  12. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/best_model_124752.pth +0 -3
  13. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/checkpoint_130000.pth +0 -3
  14. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/checkpoint_135000.pth +0 -3
  15. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/train_syntacc_baseline.py +0 -352
  16. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/config.json +0 -496
  17. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/language_ids.json +0 -15
  18. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/speakers.pth +0 -3
  19. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/train_syntacc_baseline.py +0 -352
  20. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/best_model.pth +0 -3
  21. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/best_model_87192.pth +0 -3
  22. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/checkpoint_130000.pth +0 -3
  23. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/checkpoint_135000.pth +0 -3
  24. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/config.json +0 -496
  25. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/language_ids.json +0 -15
  26. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/speakers.pth +0 -3
  27. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/trainer_0_log.txt +0 -3
  28. Experiments/runs/YourTTS-Syntacc-PT_continue-January-28-2024_02+26PM-8a499b88c/{checkpoint_185000.pth → checkpoint_195000.pth} +1 -1
  29. Experiments/runs/YourTTS-Syntacc-PT_continue-January-28-2024_02+26PM-8a499b88c/trainer_0_log.txt +2 -2
  30. Experiments/train_syntacc_baseline.py +1 -1
Experiments/nohup.out CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:5d10475e5d035b7e6fcf8289e9069f3dd25e6285616e228cbd23ff95b48dba11
- size 18092959
+ oid sha256:4f3ff491be1a22770ad6be06a4ab637e3ee1fdd7ab56a46d56b6ee5ce294191a
+ size 19098782
Experiments/run/events.out.tfevents.1706462806.edresson-train-80.145564.0 CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ce36d2c723c356665a705554a83e2b8142863730587e4f854c58a08781c9696c
- size 573377595
+ oid sha256:edf473f639006f00be06083dcda982e19ad249445299bba3ccfa9d3c3be668c9
+ size 603478571
Experiments/{runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/trainer_0_log.txt → run/events.out.tfevents.1706899297.edresson-train-80-3.1052.0} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9eb020abfc0ef9798a6097596138d1567d58429ca6c2ce6e59b350acc5301cff
- size 1771305
+ oid sha256:f92cb9921885f7784782d7c4cf4983bd9ebf92511857b363ad6c4a213d77e7fb
+ size 1426573
Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9 → YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/best_model.pth RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:2a4a050e0d7a9c6c302b70b3f59dc195b12ad8922988de81bae55cbc1a89b9c8
- size 347719275
+ oid sha256:5a8ca0385eb8c2d74471a308ead9447f46334969a793ff980a527783b55f6571
+ size 347720178
Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/best_model_85001.pth → YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a/best_model_195001.pth} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:2a4a050e0d7a9c6c302b70b3f59dc195b12ad8922988de81bae55cbc1a89b9c8
- size 347719275
+ oid sha256:5a8ca0385eb8c2d74471a308ead9447f46334969a793ff980a527783b55f6571
+ size 347720178
Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05 → YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/config.json RENAMED
@@ -397,16 +397,16 @@
  ],
  "use_sdp": true,
  "noise_scale": 1.0,
- "inference_noise_scale": 0.667,
+ "inference_noise_scale": 0.33,
  "length_scale": 1,
  "noise_scale_dp": 1.0,
- "inference_noise_scale_dp": 1.0,
+ "inference_noise_scale_dp": 0.33,
  "max_inference_len": null,
  "init_discriminator": true,
  "use_spectral_norm_disriminator": false,
  "use_speaker_embedding": false,
  "num_speakers": 0,
- "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/speakers.pth",
+ "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a/speakers.pth",
  "d_vector_file": [
  "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
  "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
@@ -429,7 +429,7 @@
  "use_language_embedding": true,
  "embedded_language_dim": 4,
  "num_languages": 0,
- "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/language_ids.json",
+ "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a/language_ids.json",
  "use_speaker_encoder_as_loss": false,
  "speaker_encoder_config_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
  "speaker_encoder_model_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
@@ -472,9 +472,9 @@
  "r": 1,
  "num_speakers": 0,
  "use_speaker_embedding": false,
- "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/speakers.pth",
+ "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a/speakers.pth",
  "speaker_embedding_channels": 256,
- "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/language_ids.json",
+ "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a/language_ids.json",
  "use_language_embedding": true,
  "use_d_vector_file": true,
  "d_vector_file": [
Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05 → YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/language_ids.json RENAMED
File without changes
Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05 → YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/speakers.pth RENAMED
File without changes
Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116 → YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/train_syntacc_baseline.py RENAMED
@@ -28,7 +28,7 @@ RUN_NAME = "YourTTS-Baseline-PT"
  OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs") # "/raid/coqui/Checkpoints/original-YourTTS/"
 
  # If you want to do transfer learning and speedup your training you can set here the path to the CML-TTS available checkpoint that cam be downloaded here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
- RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_85000.pth" # Download the checkpoint here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
+ RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT_continue-January-28-2024_02+26PM-8a499b88c/checkpoint_195000.pth" # Download the checkpoint here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
 
  # This paramter is useful to debug, it skips the training epochs and just do the evaluation and produce the test sentences
  SKIP_TRAIN_EPOCH = False
@@ -221,6 +221,8 @@ audio_config = VitsAudioConfig(
 
  # Init VITSArgs setting the arguments that are needed for the YourTTS model
  model_args = VitsArgs(
+ inference_noise_scale=0.33,
+ inference_noise_scale_dp=0.33,
  spec_segment_size=62,
  hidden_channels=192,
  hidden_channels_ffn_text_encoder=768,
Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9 → YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/trainer_0_log.txt RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:94c095ee47fd6e763ee0e129a7728cf80e5e4f21301e767ab0141c478d369b89
- size 128993
+ oid sha256:800fa1ba79843ee3494b41dbc8ffa45c6f147a7eb369e72260cbc0a5ce75dd72
+ size 135592
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/best_model.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:c62e29c7a1dd4f701ab4998e0b1f569cfe7486cc7806f149c1ff857f172383e0
- size 1043220702
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/best_model_124752.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:c62e29c7a1dd4f701ab4998e0b1f569cfe7486cc7806f149c1ff857f172383e0
- size 1043220702
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/checkpoint_130000.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:a71ead47e605fc525b264ad882fd54630c15a42eb69aaf88993d26d5ea84ae3b
- size 1043220766
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/checkpoint_135000.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:96e16ee83729813041c17f6edf8a702bdf59e7afe345cfad1fe65dd4ba0b1fce
- size 1043220766
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/train_syntacc_baseline.py DELETED
@@ -1,352 +0,0 @@
1
- import os
2
-
3
- import torch
4
- from trainer import Trainer, TrainerArgs
5
-
6
- from TTS.bin.compute_embeddings import compute_embeddings
7
- from TTS.bin.resample import resample_files
8
- from TTS.config.shared_configs import BaseDatasetConfig
9
- from TTS.tts.configs.vits_config import VitsConfig
10
- from TTS.tts.datasets import load_tts_samples
11
- from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig, VitsDataset
12
- from TTS.utils.downloaders import download_libri_tts
13
- from torch.utils.data import DataLoader
14
- from TTS.utils.samplers import PerfectBatchSampler
15
- torch.set_num_threads(24)
16
-
17
- # pylint: disable=W0105
18
- """
19
- This recipe replicates the first experiment proposed in the CML-TTS paper (https://arxiv.org/abs/2306.10097). It uses the YourTTS model.
20
- YourTTS model is based on the VITS model however it uses external speaker embeddings extracted from a pre-trained speaker encoder and has small architecture changes.
21
- """
22
- CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
23
-
24
- # Name of the run for the Trainer
25
- RUN_NAME = "YourTTS-Baseline-PT"
26
-
27
- # Path where you want to save the models outputs (configs, checkpoints and tensorboard logs)
28
- OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs") # "/raid/coqui/Checkpoints/original-YourTTS/"
29
-
30
- # If you want to do transfer learning and speedup your training you can set here the path to the CML-TTS available checkpoint that cam be downloaded here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
31
- RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_85000.pth" # Download the checkpoint here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
32
-
33
- # This paramter is useful to debug, it skips the training epochs and just do the evaluation and produce the test sentences
34
- SKIP_TRAIN_EPOCH = False
35
-
36
- # Set here the batch size to be used in training and evaluation
37
- BATCH_SIZE = 26
38
-
39
- # Training Sampling rate and the target sampling rate for resampling the downloaded dataset (Note: If you change this you might need to redownload the dataset !!)
40
- # Note: If you add new datasets, please make sure that the dataset sampling rate and this parameter are matching, otherwise resample your audios
41
- SAMPLE_RATE = 16000
42
-
43
-
44
- DASHBOARD_LOGGER="tensorboard"
45
- LOGGER_URI = None
46
-
47
- DASHBOARD_LOGGER = "clearml"
48
- LOGGER_URI = "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/"
49
-
50
-
51
-
52
- # Max audio length in seconds to be used in training (every audio bigger than it will be ignored)
53
- MAX_AUDIO_LEN_IN_SECONDS = float("inf")
54
-
55
- # Define here the datasets config
56
- brpb_train_config = BaseDatasetConfig(
57
- formatter="coqui",
58
- dataset_name="mupe",
59
- meta_file_train="metadata_coqui_brpb.csv",
60
- path="/raid/datasets/MUPE/dataset/mupe/",
61
- language="brpb"
62
- )
63
-
64
- brba_train_config = BaseDatasetConfig(
65
- formatter="coqui",
66
- dataset_name="mupe",
67
- meta_file_train="metadata_coqui_brba.csv",
68
- path="/raid/datasets/MUPE/dataset/mupe/",
69
- language="brba"
70
- )
71
-
72
- brportugal_train_config = BaseDatasetConfig(
73
- formatter="coqui",
74
- dataset_name="mupe",
75
- meta_file_train="metadata_coqui_brportugal.csv",
76
- path="/raid/datasets/MUPE/dataset/mupe/",
77
- language="brportugal"
78
- )
79
-
80
- brsp_train_config = BaseDatasetConfig(
81
- formatter="coqui",
82
- dataset_name="mupe",
83
- meta_file_train="metadata_coqui_brsp.csv",
84
- path="/raid/datasets/MUPE/dataset/mupe/",
85
- language="brsp"
86
- )
87
-
88
- brpe_train_config = BaseDatasetConfig(
89
- formatter="coqui",
90
- dataset_name="mupe",
91
- meta_file_train="metadata_coqui_brpe.csv",
92
- path="/raid/datasets/MUPE/dataset/mupe/",
93
- language="brpe"
94
- )
95
-
96
- brmg_train_config = BaseDatasetConfig(
97
- formatter="coqui",
98
- dataset_name="mupe",
99
- meta_file_train="metadata_coqui_brmg.csv",
100
- path="/raid/datasets/MUPE/dataset/mupe/",
101
- language="brmg"
102
- )
103
-
104
- brrj_train_config = BaseDatasetConfig(
105
- formatter="coqui",
106
- dataset_name="mupe",
107
- meta_file_train="metadata_coqui_brrj.csv",
108
- path="/raid/datasets/MUPE/dataset/mupe/",
109
- language="brrj"
110
- )
111
-
112
- brce_train_config = BaseDatasetConfig(
113
- formatter="coqui",
114
- dataset_name="mupe",
115
- meta_file_train="metadata_coqui_brce.csv",
116
- path="/raid/datasets/MUPE/dataset/mupe/",
117
- language="brce"
118
- )
119
-
120
- brrs_train_config = BaseDatasetConfig(
121
- formatter="coqui",
122
- dataset_name="mupe",
123
- meta_file_train="metadata_coqui_brrs.csv",
124
- path="/raid/datasets/MUPE/dataset/mupe/",
125
- language="brrs"
126
- )
127
-
128
- bralemanha_train_config = BaseDatasetConfig(
129
- formatter="coqui",
130
- dataset_name="mupe",
131
- meta_file_train="metadata_coqui_bralemanha.csv",
132
- path="/raid/datasets/MUPE/dataset/mupe/",
133
- language="bralemanha"
134
- )
135
-
136
- brgo_train_config = BaseDatasetConfig(
137
- formatter="coqui",
138
- dataset_name="mupe",
139
- meta_file_train="metadata_coqui_brgo.csv",
140
- path="/raid/datasets/MUPE/dataset/mupe/",
141
- language="brgo"
142
- )
143
-
144
- bral_train_config = BaseDatasetConfig(
145
- formatter="coqui",
146
- dataset_name="mupe",
147
- meta_file_train="metadata_coqui_bral.csv",
148
- path="/raid/datasets/MUPE/dataset/mupe/",
149
- language="bral"
150
- )
151
-
152
- brpr_train_config = BaseDatasetConfig(
153
- formatter="coqui",
154
- dataset_name="mupe",
155
- meta_file_train="metadata_coqui_brpr.csv",
156
- path="/raid/datasets/MUPE/dataset/mupe/",
157
- language="brpr"
158
- )
159
-
160
- bres_train_config = BaseDatasetConfig(
161
- formatter="coqui",
162
- dataset_name="mupe",
163
- meta_file_train="metadata_coqui_bres.csv",
164
- path="/raid/datasets/MUPE/dataset/mupe/",
165
- language="bres"
166
- )
167
-
168
- brpi_train_config = BaseDatasetConfig(
169
- formatter="coqui",
170
- dataset_name="mupe",
171
- meta_file_train="metadata_coqui_brpi.csv",
172
- path="/raid/datasets/MUPE/dataset/mupe/",
173
- language="brpi"
174
- )
175
-
176
- # bres_train_config, brpi_train_config no files found
177
- DATASETS_CONFIG_LIST = [brpb_train_config,brba_train_config,brportugal_train_config,brsp_train_config,brpe_train_config,brmg_train_config,brrj_train_config,brce_train_config,brrs_train_config,bralemanha_train_config,brgo_train_config,bral_train_config,brpr_train_config]
178
-
179
-
180
- ### Extract speaker embeddings
181
- SPEAKER_ENCODER_CHECKPOINT_PATH = (
182
- "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
183
- )
184
- SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"
185
-
186
- D_VECTOR_FILES = [] # List of speaker embeddings/d-vectors to be used during the training
187
-
188
- # Iterates all the dataset configs checking if the speakers embeddings are already computated, if not compute it
189
- for dataset_conf in DATASETS_CONFIG_LIST:
190
- # Check if the embeddings weren't already computed, if not compute it
191
- embeddings_file = os.path.join(dataset_conf.path, f"H_ASP_speaker_embeddings_{dataset_conf.language}.pth")
192
- if not os.path.isfile(embeddings_file):
193
- print(f">>> Computing the speaker embeddings for the {dataset_conf.dataset_name} dataset")
194
- compute_embeddings(
195
- SPEAKER_ENCODER_CHECKPOINT_PATH,
196
- SPEAKER_ENCODER_CONFIG_PATH,
197
- embeddings_file,
198
- old_speakers_file=None,
199
- config_dataset_path=None,
200
- formatter_name=dataset_conf.formatter,
201
- dataset_name=dataset_conf.dataset_name,
202
- dataset_path=dataset_conf.path,
203
- meta_file_train=dataset_conf.meta_file_train,
204
- meta_file_val=dataset_conf.meta_file_val,
205
- disable_cuda=False,
206
- no_eval=False,
207
- )
208
- D_VECTOR_FILES.append(embeddings_file)
209
-
210
-
211
- # Audio config used in training.
212
- audio_config = VitsAudioConfig(
213
- sample_rate=SAMPLE_RATE,
214
- hop_length=256,
215
- win_length=1024,
216
- fft_size=1024,
217
- mel_fmin=0.0,
218
- mel_fmax=None,
219
- num_mels=80,
220
- )
221
-
222
- # Init VITSArgs setting the arguments that are needed for the YourTTS model
223
- model_args = VitsArgs(
224
- spec_segment_size=62,
225
- hidden_channels=192,
226
- hidden_channels_ffn_text_encoder=768,
227
- num_heads_text_encoder=2,
228
- num_layers_text_encoder=10,
229
- kernel_size_text_encoder=3,
230
- dropout_p_text_encoder=0.1,
231
- d_vector_file=D_VECTOR_FILES,
232
- use_d_vector_file=True,
233
- d_vector_dim=512,
234
- speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
235
- speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
236
- resblock_type_decoder="2", # In the paper, we accidentally trained the YourTTS using ResNet blocks type 2, if you like you can use the ResNet blocks type 1 like the VITS model
237
- # Useful parameters to enable the Speaker Consistency Loss (SCL) described in the paper
238
- use_speaker_encoder_as_loss=False,
239
- # Useful parameters to enable multilingual training
240
- use_language_embedding=True,
241
- embedded_language_dim=4,
242
- use_adaptive_weight_text_encoder=False,
243
- use_perfect_class_batch_sampler=True,
244
- perfect_class_batch_sampler_key="language"
245
- )
246
-
247
- # General training config, here you can change the batch size and others useful parameters
248
- config = VitsConfig(
249
- output_path=OUT_PATH,
250
- model_args=model_args,
251
- run_name=RUN_NAME,
252
- project_name="SYNTACC",
253
- run_description="""
254
- - YourTTS with SYNTACC text encoder
255
- """,
256
- dashboard_logger=DASHBOARD_LOGGER,
257
- logger_uri=LOGGER_URI,
258
- audio=audio_config,
259
- batch_size=BATCH_SIZE,
260
- batch_group_size=48,
261
- eval_batch_size=BATCH_SIZE,
262
- num_loader_workers=8,
263
- eval_split_max_size=256,
264
- print_step=50,
265
- plot_step=100,
266
- log_model_step=1000,
267
- save_step=5000,
268
- save_n_checkpoints=2,
269
- save_checkpoints=True,
270
- # target_loss="loss_1",
271
- print_eval=False,
272
- use_phonemes=False,
273
- phonemizer="espeak",
274
- phoneme_language="en",
275
- compute_input_seq_cache=True,
276
- add_blank=True,
277
- text_cleaner="multilingual_cleaners",
278
- characters=CharactersConfig(
279
- characters_class="TTS.tts.models.vits.VitsCharacters",
280
- pad="_",
281
- eos="&",
282
- bos="*",
283
- blank=None,
284
- characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
285
- punctuations="\u2014!'(),-.:;?\u00bf ",
286
- phonemes="iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
287
- is_unique=True,
288
- is_sorted=True,
289
- ),
290
- phoneme_cache_path=None,
291
- precompute_num_workers=12,
292
- start_by_longest=True,
293
- datasets=DATASETS_CONFIG_LIST,
294
- cudnn_benchmark=False,
295
- max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
296
- mixed_precision=False,
297
- test_sentences=[
298
- #GUSTAVO: apenas pessoas do treino
299
- ["Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.", "EDILEINE_FONSECA", None, "brsp"],
300
- ["Quem semeia ventos, colhe tempestades.", "JOSE_PAULO_DE_ARAUJO", None, "brpb"],
301
- ["O olho do dono \u00e9 que engorda o gado.", "VITOR_RAFAEL_OLIVEIRA_ALVES", None, "brba"],
302
- ["\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.", "MARIA_AURORA_FELIX", None, "brportugal"],
303
- ["Quem espera sempre alcan\u00e7a.", "ANTONIO_DE_AMORIM_COSTA", None, "brpe"],
304
- ["Cada macaco no seu galho.", "ALCIDES_DE_LIMA", None, "brmg"],
305
- ["Em terra de cego, quem tem um olho \u00e9 rei.", "ALUISIO_SOARES_DE_SOUSA", None, "brrj"],
306
- ["A ocasi\u00e3o faz o ladr\u00e3o.", "FRANCISCO_JOSE_MOREIRA_MOTA", None, "brce"],
307
- ["De gr\u00e3o em gr\u00e3o, a galinha enche o papo.", "EVALDO_ANDRADA_CORREA", None, "brrs"],
308
- ["Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.", "DORIS_ALEXANDER", None, "bralemanha"],
309
- ["Quem n\u00e3o arrisca, n\u00e3o petisca.", "DONALDO_LUIZ_DE_ALMEIDA", None, "brgo"],
310
- ["A uni\u00e3o faz a for\u00e7a.", "GERONCIO_HENRIQUE_NETO", None, "bral"],
311
- ["Em boca fechada n\u00e3o entra mosquito.", "MALU_NATEL_FREIRE_WEBER", None, "brpr"],
312
- # ["Quem n\u00e3o tem dinheiro, n\u00e3o tem v\u00edcios.", "INES_VIEIRA_BOGEA", None, "bres"],
313
- # ["Quando voc\u00ea n\u00e3o corre nenhum risco, voc\u00ea arrisca tudo.", "MARIA_ASSUNCAO_SOUSA", None, "brpi"]
314
- ],
315
- # Enable the weighted sampler
316
- use_weighted_sampler=True,
317
- # Ensures that all speakers are seen in the training batch equally no matter how many samples each speaker has
318
- # weighted_sampler_attrs={"language": 1.0, "speaker_name": 1.0},
319
- weighted_sampler_attrs={"language": 1.0},
320
- weighted_sampler_multipliers={
321
- # "speaker_name": {
322
- # you can force the batching scheme to give a higher weight to a certain speaker and then this speaker will appears more frequently on the batch.
323
- # It will speedup the speaker adaptation process. Considering the CML train dataset and "new_speaker" as the speaker name of the speaker that you want to adapt.
324
- # The line above will make the balancer consider the "new_speaker" as 106 speakers so 1/4 of the number of speakers present on CML dataset.
325
- # 'new_speaker': 106, # (CML tot. train speaker)/4 = (424/4) = 106
326
- # }
327
- },
328
- # It defines the Speaker Consistency Loss (SCL) Ξ± to 9 like the YourTTS paper
329
- speaker_encoder_loss_alpha=9.0,
330
- )
331
-
332
- # Load all the datasets samples and split traning and evaluation sets
333
- train_samples, eval_samples = load_tts_samples(
334
- config.datasets,
335
- eval_split=True,
336
- eval_split_max_size=config.eval_split_max_size,
337
- eval_split_size=config.eval_split_size,
338
- )
339
-
340
- # Init the model
341
- model = Vits.init_from_config(config)
342
-
343
- # Init the trainer and πŸš€
344
- trainer = Trainer(
345
- TrainerArgs(restore_path=RESTORE_PATH, skip_train_epoch=SKIP_TRAIN_EPOCH, start_with_eval=True),
346
- config,
347
- output_path=OUT_PATH,
348
- model=model,
349
- train_samples=train_samples,
350
- eval_samples=eval_samples,
351
- )
352
- trainer.fit()
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/config.json DELETED
@@ -1,496 +0,0 @@
1
- {
2
- "output_path": "/raid/datasets/MUPE/Experiments/runs",
3
- "logger_uri": "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/",
4
- "run_name": "YourTTS-Baseline-PT",
5
- "project_name": "SYNTACC",
6
- "run_description": "\n - YourTTS with SYNTACC text encoder\n ",
7
- "print_step": 50,
8
- "plot_step": 100,
9
- "model_param_stats": false,
10
- "wandb_entity": null,
11
- "dashboard_logger": "clearml",
12
- "save_on_interrupt": true,
13
- "log_model_step": 1000,
14
- "save_step": 5000,
15
- "save_n_checkpoints": 2,
16
- "save_checkpoints": true,
17
- "save_all_best": false,
18
- "save_best_after": 10000,
19
- "target_loss": null,
20
- "print_eval": false,
21
- "test_delay_epochs": 0,
22
- "run_eval": true,
23
- "run_eval_steps": null,
24
- "distributed_backend": "nccl",
25
- "distributed_url": "tcp://localhost:54321",
26
- "mixed_precision": false,
27
- "precision": "fp16",
28
- "epochs": 1000,
29
- "batch_size": 26,
30
- "eval_batch_size": 26,
31
- "grad_clip": [
32
- 1000,
33
- 1000
34
- ],
35
- "scheduler_after_epoch": true,
36
- "lr": 0.001,
37
- "optimizer": "AdamW",
38
- "optimizer_params": {
39
- "betas": [
40
- 0.8,
41
- 0.99
42
- ],
43
- "eps": 1e-09,
44
- "weight_decay": 0.01
45
- },
46
- "lr_scheduler": null,
47
- "lr_scheduler_params": {},
48
- "use_grad_scaler": false,
49
- "allow_tf32": false,
50
- "cudnn_enable": true,
51
- "cudnn_deterministic": false,
52
- "cudnn_benchmark": false,
53
- "training_seed": 54321,
54
- "model": "vits",
55
- "num_loader_workers": 8,
56
- "num_eval_loader_workers": 0,
57
- "use_noise_augment": false,
58
- "audio": {
59
- "fft_size": 1024,
60
- "sample_rate": 16000,
61
- "win_length": 1024,
62
- "hop_length": 256,
63
- "num_mels": 80,
64
- "mel_fmin": 0.0,
65
- "mel_fmax": null
66
- },
67
- "use_phonemes": false,
68
- "phonemizer": "espeak",
69
- "phoneme_language": "en",
70
- "compute_input_seq_cache": true,
71
- "text_cleaner": "multilingual_cleaners",
72
- "enable_eos_bos_chars": false,
73
- "test_sentences_file": "",
74
- "phoneme_cache_path": null,
75
- "characters": {
76
- "characters_class": "TTS.tts.models.vits.VitsCharacters",
77
- "vocab_dict": null,
78
- "pad": "_",
79
- "eos": "&",
80
- "bos": "*",
81
- "blank": null,
82
- "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
83
- "punctuations": "\u2014!'(),-.:;?\u00bf ",
84
- "phonemes": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
85
- "is_unique": true,
86
- "is_sorted": true
87
- },
88
- "add_blank": true,
89
- "batch_group_size": 48,
90
- "loss_masking": null,
91
- "min_audio_len": 1,
92
- "max_audio_len": Infinity,
93
- "min_text_len": 1,
94
- "max_text_len": Infinity,
95
- "compute_f0": false,
96
- "compute_energy": false,
97
- "compute_linear_spec": true,
98
- "precompute_num_workers": 12,
99
- "start_by_longest": true,
100
- "shuffle": false,
101
- "drop_last": false,
102
- "datasets": [
103
- {
104
- "formatter": "coqui",
105
- "dataset_name": "mupe",
106
- "path": "/raid/datasets/MUPE/dataset/mupe/",
107
- "meta_file_train": "metadata_coqui_brpb.csv",
108
- "ignored_speakers": null,
109
- "language": "brpb",
110
- "phonemizer": "",
111
- "meta_file_val": "",
112
- "meta_file_attn_mask": ""
113
- },
114
- {
115
- "formatter": "coqui",
116
- "dataset_name": "mupe",
117
- "path": "/raid/datasets/MUPE/dataset/mupe/",
118
- "meta_file_train": "metadata_coqui_brba.csv",
119
- "ignored_speakers": null,
120
- "language": "brba",
121
- "phonemizer": "",
122
- "meta_file_val": "",
123
- "meta_file_attn_mask": ""
124
- },
125
- {
126
- "formatter": "coqui",
127
- "dataset_name": "mupe",
128
- "path": "/raid/datasets/MUPE/dataset/mupe/",
129
- "meta_file_train": "metadata_coqui_brportugal.csv",
130
- "ignored_speakers": null,
131
- "language": "brportugal",
132
- "phonemizer": "",
133
- "meta_file_val": "",
134
- "meta_file_attn_mask": ""
135
- },
136
- {
137
- "formatter": "coqui",
138
- "dataset_name": "mupe",
139
- "path": "/raid/datasets/MUPE/dataset/mupe/",
140
- "meta_file_train": "metadata_coqui_brsp.csv",
141
- "ignored_speakers": null,
142
- "language": "brsp",
143
- "phonemizer": "",
144
- "meta_file_val": "",
145
- "meta_file_attn_mask": ""
146
- },
147
- {
148
- "formatter": "coqui",
149
- "dataset_name": "mupe",
150
- "path": "/raid/datasets/MUPE/dataset/mupe/",
151
- "meta_file_train": "metadata_coqui_brpe.csv",
152
- "ignored_speakers": null,
153
- "language": "brpe",
154
- "phonemizer": "",
155
- "meta_file_val": "",
156
- "meta_file_attn_mask": ""
157
- },
158
- {
159
- "formatter": "coqui",
160
- "dataset_name": "mupe",
161
- "path": "/raid/datasets/MUPE/dataset/mupe/",
162
- "meta_file_train": "metadata_coqui_brmg.csv",
163
- "ignored_speakers": null,
164
- "language": "brmg",
165
- "phonemizer": "",
166
- "meta_file_val": "",
167
- "meta_file_attn_mask": ""
168
- },
169
- {
170
- "formatter": "coqui",
171
- "dataset_name": "mupe",
172
- "path": "/raid/datasets/MUPE/dataset/mupe/",
173
- "meta_file_train": "metadata_coqui_brrj.csv",
174
- "ignored_speakers": null,
175
- "language": "brrj",
176
- "phonemizer": "",
177
- "meta_file_val": "",
178
- "meta_file_attn_mask": ""
179
- },
180
- {
181
- "formatter": "coqui",
182
- "dataset_name": "mupe",
183
- "path": "/raid/datasets/MUPE/dataset/mupe/",
184
- "meta_file_train": "metadata_coqui_brce.csv",
185
- "ignored_speakers": null,
186
- "language": "brce",
187
- "phonemizer": "",
188
- "meta_file_val": "",
189
- "meta_file_attn_mask": ""
190
- },
191
- {
192
- "formatter": "coqui",
193
- "dataset_name": "mupe",
194
- "path": "/raid/datasets/MUPE/dataset/mupe/",
195
- "meta_file_train": "metadata_coqui_brrs.csv",
196
- "ignored_speakers": null,
197
- "language": "brrs",
198
- "phonemizer": "",
199
- "meta_file_val": "",
200
- "meta_file_attn_mask": ""
201
- },
202
- {
203
- "formatter": "coqui",
204
- "dataset_name": "mupe",
205
- "path": "/raid/datasets/MUPE/dataset/mupe/",
206
- "meta_file_train": "metadata_coqui_bralemanha.csv",
207
- "ignored_speakers": null,
208
- "language": "bralemanha",
209
- "phonemizer": "",
210
- "meta_file_val": "",
211
- "meta_file_attn_mask": ""
212
- },
213
- {
214
- "formatter": "coqui",
215
- "dataset_name": "mupe",
216
- "path": "/raid/datasets/MUPE/dataset/mupe/",
217
- "meta_file_train": "metadata_coqui_brgo.csv",
218
- "ignored_speakers": null,
219
- "language": "brgo",
220
- "phonemizer": "",
221
- "meta_file_val": "",
222
- "meta_file_attn_mask": ""
223
- },
224
- {
225
- "formatter": "coqui",
226
- "dataset_name": "mupe",
227
- "path": "/raid/datasets/MUPE/dataset/mupe/",
228
- "meta_file_train": "metadata_coqui_bral.csv",
229
- "ignored_speakers": null,
230
- "language": "bral",
231
- "phonemizer": "",
232
- "meta_file_val": "",
233
- "meta_file_attn_mask": ""
234
- },
235
- {
236
- "formatter": "coqui",
237
- "dataset_name": "mupe",
238
- "path": "/raid/datasets/MUPE/dataset/mupe/",
239
- "meta_file_train": "metadata_coqui_brpr.csv",
240
- "ignored_speakers": null,
241
- "language": "brpr",
242
- "phonemizer": "",
243
- "meta_file_val": "",
244
- "meta_file_attn_mask": ""
245
- }
246
- ],
247
- "test_sentences": [
248
- [
249
- "Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.",
250
- "EDILEINE_FONSECA",
251
- null,
252
- "brsp"
253
- ],
254
- [
255
- "Quem semeia ventos, colhe tempestades.",
256
- "JOSE_PAULO_DE_ARAUJO",
257
- null,
258
- "brpb"
259
- ],
260
- [
261
- "O olho do dono \u00e9 que engorda o gado.",
262
- "VITOR_RAFAEL_OLIVEIRA_ALVES",
263
- null,
264
- "brba"
265
- ],
266
- [
267
- "\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.",
268
- "MARIA_AURORA_FELIX",
269
- null,
270
- "brportugal"
271
- ],
272
- [
273
- "Quem espera sempre alcan\u00e7a.",
274
- "ANTONIO_DE_AMORIM_COSTA",
275
- null,
276
- "brpe"
277
- ],
278
- [
279
- "Cada macaco no seu galho.",
280
- "ALCIDES_DE_LIMA",
281
- null,
282
- "brmg"
283
- ],
284
- [
285
- "Em terra de cego, quem tem um olho \u00e9 rei.",
286
- "ALUISIO_SOARES_DE_SOUSA",
287
- null,
288
- "brrj"
289
- ],
290
- [
291
- "A ocasi\u00e3o faz o ladr\u00e3o.",
292
- "FRANCISCO_JOSE_MOREIRA_MOTA",
293
- null,
294
- "brce"
295
- ],
296
- [
297
- "De gr\u00e3o em gr\u00e3o, a galinha enche o papo.",
298
- "EVALDO_ANDRADA_CORREA",
299
- null,
300
- "brrs"
301
- ],
302
- [
303
- "Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.",
304
- "DORIS_ALEXANDER",
305
- null,
306
- "bralemanha"
307
- ],
308
- [
309
- "Quem n\u00e3o arrisca, n\u00e3o petisca.",
310
- "DONALDO_LUIZ_DE_ALMEIDA",
311
- null,
312
- "brgo"
313
- ],
314
- [
315
- "A uni\u00e3o faz a for\u00e7a.",
316
- "GERONCIO_HENRIQUE_NETO",
317
- null,
318
- "bral"
319
- ],
320
- [
321
- "Em boca fechada n\u00e3o entra mosquito.",
322
- "MALU_NATEL_FREIRE_WEBER",
323
- null,
324
- "brpr"
325
- ]
326
- ],
327
- "eval_split_max_size": 256,
328
- "eval_split_size": 0.01,
329
- "use_speaker_weighted_sampler": false,
330
- "speaker_weighted_sampler_alpha": 1.0,
331
- "use_language_weighted_sampler": false,
332
- "language_weighted_sampler_alpha": 1.0,
333
- "use_length_weighted_sampler": false,
334
- "length_weighted_sampler_alpha": 1.0,
335
- "model_args": {
336
- "num_chars": 266,
337
- "out_channels": 513,
338
- "spec_segment_size": 62,
339
- "hidden_channels": 192,
340
- "use_adaptive_weight_text_encoder": false,
341
- "use_perfect_class_batch_sampler": true,
342
- "perfect_class_batch_sampler_key": "language",
343
- "hidden_channels_ffn_text_encoder": 768,
344
- "num_heads_text_encoder": 2,
345
- "num_layers_text_encoder": 10,
346
- "kernel_size_text_encoder": 3,
347
- "dropout_p_text_encoder": 0.1,
348
- "dropout_p_duration_predictor": 0.5,
349
- "kernel_size_posterior_encoder": 5,
350
- "dilation_rate_posterior_encoder": 1,
351
- "num_layers_posterior_encoder": 16,
352
- "kernel_size_flow": 5,
353
- "dilation_rate_flow": 1,
354
- "num_layers_flow": 4,
355
- "resblock_type_decoder": "2",
356
- "resblock_kernel_sizes_decoder": [
357
- 3,
358
- 7,
359
- 11
360
- ],
361
- "resblock_dilation_sizes_decoder": [
362
- [
363
- 1,
364
- 3,
365
- 5
366
- ],
367
- [
368
- 1,
369
- 3,
370
- 5
371
- ],
372
- [
373
- 1,
374
- 3,
375
- 5
376
- ]
377
- ],
378
- "upsample_rates_decoder": [
379
- 8,
380
- 8,
381
- 2,
382
- 2
383
- ],
384
- "upsample_initial_channel_decoder": 512,
385
- "upsample_kernel_sizes_decoder": [
386
- 16,
387
- 16,
388
- 4,
389
- 4
390
- ],
391
- "periods_multi_period_discriminator": [
392
- 2,
393
- 3,
394
- 5,
395
- 7,
396
- 11
397
- ],
398
- "use_sdp": true,
399
- "noise_scale": 1.0,
400
- "inference_noise_scale": 0.667,
401
- "length_scale": 1,
402
- "noise_scale_dp": 1.0,
403
- "inference_noise_scale_dp": 1.0,
404
- "max_inference_len": null,
405
- "init_discriminator": true,
406
- "use_spectral_norm_disriminator": false,
407
- "use_speaker_embedding": false,
408
- "num_speakers": 0,
409
- "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/speakers.pth",
410
- "d_vector_file": [
411
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
412
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
413
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
414
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
415
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
416
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
417
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
418
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
419
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
420
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
421
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
422
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
423
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
424
- ],
425
- "speaker_embedding_channels": 256,
426
- "use_d_vector_file": true,
427
- "d_vector_dim": 512,
428
- "detach_dp_input": true,
429
- "use_language_embedding": true,
430
- "embedded_language_dim": 4,
431
- "num_languages": 0,
432
- "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/language_ids.json",
433
- "use_speaker_encoder_as_loss": false,
434
- "speaker_encoder_config_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
435
- "speaker_encoder_model_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
436
- "condition_dp_on_speaker": true,
437
- "freeze_encoder": false,
438
- "freeze_DP": false,
439
- "freeze_PE": false,
440
- "freeze_flow_decoder": false,
441
- "freeze_waveform_decoder": false,
442
- "encoder_sample_rate": null,
443
- "interpolate_z": true,
444
- "reinit_DP": false,
445
- "reinit_text_encoder": false
446
- },
447
- "lr_gen": 0.0002,
448
- "lr_disc": 0.0002,
449
- "lr_scheduler_gen": "ExponentialLR",
450
- "lr_scheduler_gen_params": {
451
- "gamma": 0.999875,
452
- "last_epoch": -1
453
- },
454
- "lr_scheduler_disc": "ExponentialLR",
455
- "lr_scheduler_disc_params": {
456
- "gamma": 0.999875,
457
- "last_epoch": -1
458
- },
459
- "kl_loss_alpha": 1.0,
460
- "disc_loss_alpha": 1.0,
461
- "gen_loss_alpha": 1.0,
462
- "feat_loss_alpha": 1.0,
463
- "mel_loss_alpha": 45.0,
464
- "dur_loss_alpha": 1.0,
465
- "speaker_encoder_loss_alpha": 9.0,
466
- "return_wav": true,
467
- "use_weighted_sampler": true,
468
- "weighted_sampler_attrs": {
469
- "language": 1.0
470
- },
471
- "weighted_sampler_multipliers": {},
472
- "r": 1,
473
- "num_speakers": 0,
474
- "use_speaker_embedding": false,
475
- "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/speakers.pth",
476
- "speaker_embedding_channels": 256,
477
- "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/language_ids.json",
478
- "use_language_embedding": true,
479
- "use_d_vector_file": true,
480
- "d_vector_file": [
481
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
482
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
483
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
484
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
485
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
486
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
487
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
488
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
489
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
490
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
491
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
492
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
493
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
494
- ],
495
- "d_vector_dim": 512
496
- }
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/language_ids.json DELETED
@@ -1,15 +0,0 @@
- {
- "bral": 0,
- "bralemanha": 1,
- "brba": 2,
- "brce": 3,
- "brgo": 4,
- "brmg": 5,
- "brpb": 6,
- "brpe": 7,
- "brportugal": 8,
- "brpr": 9,
- "brrj": 10,
- "brrs": 11,
- "brsp": 12
- }
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/speakers.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:d0b8d8013199105bfba41bbef0ac6c7fc44ecb3385a39980da80931496c039bf
- size 3296
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/train_syntacc_baseline.py DELETED
@@ -1,352 +0,0 @@
1
- import os
2
-
3
- import torch
4
- from trainer import Trainer, TrainerArgs
5
-
6
- from TTS.bin.compute_embeddings import compute_embeddings
7
- from TTS.bin.resample import resample_files
8
- from TTS.config.shared_configs import BaseDatasetConfig
9
- from TTS.tts.configs.vits_config import VitsConfig
10
- from TTS.tts.datasets import load_tts_samples
11
- from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig, VitsDataset
12
- from TTS.utils.downloaders import download_libri_tts
13
- from torch.utils.data import DataLoader
14
- from TTS.utils.samplers import PerfectBatchSampler
15
- torch.set_num_threads(24)
16
-
17
- # pylint: disable=W0105
18
- """
19
- This recipe replicates the first experiment proposed in the CML-TTS paper (https://arxiv.org/abs/2306.10097). It uses the YourTTS model.
20
- YourTTS model is based on the VITS model however it uses external speaker embeddings extracted from a pre-trained speaker encoder and has small architecture changes.
21
- """
22
- CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
23
-
24
- # Name of the run for the Trainer
25
- RUN_NAME = "YourTTS-Baseline-PT"
26
-
27
- # Path where you want to save the models outputs (configs, checkpoints and tensorboard logs)
28
- OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs") # "/raid/coqui/Checkpoints/original-YourTTS/"
29
-
30
- # If you want to do transfer learning and speedup your training you can set here the path to the CML-TTS available checkpoint that cam be downloaded here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
31
- RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_85000.pth" # Download the checkpoint here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
32
-
33
- # This paramter is useful to debug, it skips the training epochs and just do the evaluation and produce the test sentences
34
- SKIP_TRAIN_EPOCH = False
35
-
36
- # Set here the batch size to be used in training and evaluation
37
- BATCH_SIZE = 26
38
-
39
- # Training Sampling rate and the target sampling rate for resampling the downloaded dataset (Note: If you change this you might need to redownload the dataset !!)
40
- # Note: If you add new datasets, please make sure that the dataset sampling rate and this parameter are matching, otherwise resample your audios
41
- SAMPLE_RATE = 16000
42
-
43
-
44
- DASHBOARD_LOGGER="tensorboard"
45
- LOGGER_URI = None
46
-
47
- DASHBOARD_LOGGER = "clearml"
48
- LOGGER_URI = "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/"
49
-
50
-
51
-
52
- # Max audio length in seconds to be used in training (every audio bigger than it will be ignored)
53
- MAX_AUDIO_LEN_IN_SECONDS = float("inf")
54
-
55
- # Define here the datasets config
56
- brpb_train_config = BaseDatasetConfig(
57
- formatter="coqui",
58
- dataset_name="mupe",
59
- meta_file_train="metadata_coqui_brpb.csv",
60
- path="/raid/datasets/MUPE/dataset/mupe/",
61
- language="brpb"
62
- )
63
-
64
- brba_train_config = BaseDatasetConfig(
65
- formatter="coqui",
66
- dataset_name="mupe",
67
- meta_file_train="metadata_coqui_brba.csv",
68
- path="/raid/datasets/MUPE/dataset/mupe/",
69
- language="brba"
70
- )
71
-
72
- brportugal_train_config = BaseDatasetConfig(
73
- formatter="coqui",
74
- dataset_name="mupe",
75
- meta_file_train="metadata_coqui_brportugal.csv",
76
- path="/raid/datasets/MUPE/dataset/mupe/",
77
- language="brportugal"
78
- )
79
-
80
- brsp_train_config = BaseDatasetConfig(
81
- formatter="coqui",
82
- dataset_name="mupe",
83
- meta_file_train="metadata_coqui_brsp.csv",
84
- path="/raid/datasets/MUPE/dataset/mupe/",
85
- language="brsp"
86
- )
87
-
88
- brpe_train_config = BaseDatasetConfig(
89
- formatter="coqui",
90
- dataset_name="mupe",
91
- meta_file_train="metadata_coqui_brpe.csv",
92
- path="/raid/datasets/MUPE/dataset/mupe/",
93
- language="brpe"
94
- )
95
-
96
- brmg_train_config = BaseDatasetConfig(
97
- formatter="coqui",
98
- dataset_name="mupe",
99
- meta_file_train="metadata_coqui_brmg.csv",
100
- path="/raid/datasets/MUPE/dataset/mupe/",
101
- language="brmg"
102
- )
103
-
104
- brrj_train_config = BaseDatasetConfig(
105
- formatter="coqui",
106
- dataset_name="mupe",
107
- meta_file_train="metadata_coqui_brrj.csv",
108
- path="/raid/datasets/MUPE/dataset/mupe/",
109
- language="brrj"
110
- )
111
-
112
- brce_train_config = BaseDatasetConfig(
113
- formatter="coqui",
114
- dataset_name="mupe",
115
- meta_file_train="metadata_coqui_brce.csv",
116
- path="/raid/datasets/MUPE/dataset/mupe/",
117
- language="brce"
118
- )
119
-
120
- brrs_train_config = BaseDatasetConfig(
121
- formatter="coqui",
122
- dataset_name="mupe",
123
- meta_file_train="metadata_coqui_brrs.csv",
124
- path="/raid/datasets/MUPE/dataset/mupe/",
125
- language="brrs"
126
- )
127
-
128
- bralemanha_train_config = BaseDatasetConfig(
129
- formatter="coqui",
130
- dataset_name="mupe",
131
- meta_file_train="metadata_coqui_bralemanha.csv",
132
- path="/raid/datasets/MUPE/dataset/mupe/",
133
- language="bralemanha"
134
- )
135
-
136
- brgo_train_config = BaseDatasetConfig(
137
- formatter="coqui",
138
- dataset_name="mupe",
139
- meta_file_train="metadata_coqui_brgo.csv",
140
- path="/raid/datasets/MUPE/dataset/mupe/",
141
- language="brgo"
142
- )
143
-
144
- bral_train_config = BaseDatasetConfig(
145
- formatter="coqui",
146
- dataset_name="mupe",
147
- meta_file_train="metadata_coqui_bral.csv",
148
- path="/raid/datasets/MUPE/dataset/mupe/",
149
- language="bral"
150
- )
151
-
152
- brpr_train_config = BaseDatasetConfig(
153
- formatter="coqui",
154
- dataset_name="mupe",
155
- meta_file_train="metadata_coqui_brpr.csv",
156
- path="/raid/datasets/MUPE/dataset/mupe/",
157
- language="brpr"
158
- )
159
-
160
- bres_train_config = BaseDatasetConfig(
161
- formatter="coqui",
162
- dataset_name="mupe",
163
- meta_file_train="metadata_coqui_bres.csv",
164
- path="/raid/datasets/MUPE/dataset/mupe/",
165
- language="bres"
166
- )
167
-
168
- brpi_train_config = BaseDatasetConfig(
169
- formatter="coqui",
170
- dataset_name="mupe",
171
- meta_file_train="metadata_coqui_brpi.csv",
172
- path="/raid/datasets/MUPE/dataset/mupe/",
173
- language="brpi"
174
- )
175
-
176
- # bres_train_config, brpi_train_config no files found
177
- DATASETS_CONFIG_LIST = [brpb_train_config,brba_train_config,brportugal_train_config,brsp_train_config,brpe_train_config,brmg_train_config,brrj_train_config,brce_train_config,brrs_train_config,bralemanha_train_config,brgo_train_config,bral_train_config,brpr_train_config]
178
-
179
-
180
- ### Extract speaker embeddings
181
- SPEAKER_ENCODER_CHECKPOINT_PATH = (
182
- "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
183
- )
184
- SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"
185
-
186
- D_VECTOR_FILES = [] # List of speaker embeddings/d-vectors to be used during the training
187
-
188
- # Iterates all the dataset configs checking if the speakers embeddings are already computated, if not compute it
189
- for dataset_conf in DATASETS_CONFIG_LIST:
190
- # Check if the embeddings weren't already computed, if not compute it
191
- embeddings_file = os.path.join(dataset_conf.path, f"H_ASP_speaker_embeddings_{dataset_conf.language}.pth")
192
- if not os.path.isfile(embeddings_file):
193
- print(f">>> Computing the speaker embeddings for the {dataset_conf.dataset_name} dataset")
194
- compute_embeddings(
195
- SPEAKER_ENCODER_CHECKPOINT_PATH,
196
- SPEAKER_ENCODER_CONFIG_PATH,
197
- embeddings_file,
198
- old_speakers_file=None,
199
- config_dataset_path=None,
200
- formatter_name=dataset_conf.formatter,
201
- dataset_name=dataset_conf.dataset_name,
202
- dataset_path=dataset_conf.path,
203
- meta_file_train=dataset_conf.meta_file_train,
204
- meta_file_val=dataset_conf.meta_file_val,
205
- disable_cuda=False,
206
- no_eval=False,
207
- )
208
- D_VECTOR_FILES.append(embeddings_file)
209
-
210
-
211
- # Audio config used in training.
212
- audio_config = VitsAudioConfig(
213
- sample_rate=SAMPLE_RATE,
214
- hop_length=256,
215
- win_length=1024,
216
- fft_size=1024,
217
- mel_fmin=0.0,
218
- mel_fmax=None,
219
- num_mels=80,
220
- )
221
-
222
- # Init VITSArgs setting the arguments that are needed for the YourTTS model
223
- model_args = VitsArgs(
224
- spec_segment_size=62,
225
- hidden_channels=192,
226
- hidden_channels_ffn_text_encoder=768,
227
- num_heads_text_encoder=2,
228
- num_layers_text_encoder=10,
229
- kernel_size_text_encoder=3,
230
- dropout_p_text_encoder=0.1,
231
- d_vector_file=D_VECTOR_FILES,
232
- use_d_vector_file=True,
233
- d_vector_dim=512,
234
- speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
235
- speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
236
- resblock_type_decoder="2", # In the paper, we accidentally trained the YourTTS using ResNet blocks type 2, if you like you can use the ResNet blocks type 1 like the VITS model
237
- # Useful parameters to enable the Speaker Consistency Loss (SCL) described in the paper
238
- use_speaker_encoder_as_loss=False,
239
- # Useful parameters to enable multilingual training
240
- use_language_embedding=True,
241
- embedded_language_dim=4,
242
- use_adaptive_weight_text_encoder=False,
243
- use_perfect_class_batch_sampler=True,
244
- perfect_class_batch_sampler_key="language"
245
- )
246
-
247
- # General training config; here you can change the batch size and other useful parameters
248
- config = VitsConfig(
249
- output_path=OUT_PATH,
250
- model_args=model_args,
251
- run_name=RUN_NAME,
252
- project_name="SYNTACC",
253
- run_description="""
254
- - YourTTS with SYNTACC text encoder
255
- """,
256
- dashboard_logger=DASHBOARD_LOGGER,
257
- logger_uri=LOGGER_URI,
258
- audio=audio_config,
259
- batch_size=BATCH_SIZE,
260
- batch_group_size=48,
261
- eval_batch_size=BATCH_SIZE,
262
- num_loader_workers=8,
263
- eval_split_max_size=256,
264
- print_step=50,
265
- plot_step=100,
266
- log_model_step=1000,
267
- save_step=5000,
268
- save_n_checkpoints=2,
269
- save_checkpoints=True,
270
- # target_loss="loss_1",
271
- print_eval=False,
272
- use_phonemes=False,
273
- phonemizer="espeak",
274
- phoneme_language="en",
275
- compute_input_seq_cache=True,
276
- add_blank=True,
277
- text_cleaner="multilingual_cleaners",
278
- characters=CharactersConfig(
279
- characters_class="TTS.tts.models.vits.VitsCharacters",
280
- pad="_",
281
- eos="&",
282
- bos="*",
283
- blank=None,
284
- characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
285
- punctuations="\u2014!'(),-.:;?\u00bf ",
286
- phonemes="iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
287
- is_unique=True,
288
- is_sorted=True,
289
- ),
290
- phoneme_cache_path=None,
291
- precompute_num_workers=12,
292
- start_by_longest=True,
293
- datasets=DATASETS_CONFIG_LIST,
294
- cudnn_benchmark=False,
295
- max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
296
- mixed_precision=False,
297
- test_sentences=[
298
- # GUSTAVO: only speakers from the training set
299
- ["Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.", "EDILEINE_FONSECA", None, "brsp"],
300
- ["Quem semeia ventos, colhe tempestades.", "JOSE_PAULO_DE_ARAUJO", None, "brpb"],
301
- ["O olho do dono \u00e9 que engorda o gado.", "VITOR_RAFAEL_OLIVEIRA_ALVES", None, "brba"],
302
- ["\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.", "MARIA_AURORA_FELIX", None, "brportugal"],
303
- ["Quem espera sempre alcan\u00e7a.", "ANTONIO_DE_AMORIM_COSTA", None, "brpe"],
304
- ["Cada macaco no seu galho.", "ALCIDES_DE_LIMA", None, "brmg"],
305
- ["Em terra de cego, quem tem um olho \u00e9 rei.", "ALUISIO_SOARES_DE_SOUSA", None, "brrj"],
306
- ["A ocasi\u00e3o faz o ladr\u00e3o.", "FRANCISCO_JOSE_MOREIRA_MOTA", None, "brce"],
307
- ["De gr\u00e3o em gr\u00e3o, a galinha enche o papo.", "EVALDO_ANDRADA_CORREA", None, "brrs"],
308
- ["Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.", "DORIS_ALEXANDER", None, "bralemanha"],
309
- ["Quem n\u00e3o arrisca, n\u00e3o petisca.", "DONALDO_LUIZ_DE_ALMEIDA", None, "brgo"],
310
- ["A uni\u00e3o faz a for\u00e7a.", "GERONCIO_HENRIQUE_NETO", None, "bral"],
311
- ["Em boca fechada n\u00e3o entra mosquito.", "MALU_NATEL_FREIRE_WEBER", None, "brpr"],
312
- # ["Quem n\u00e3o tem dinheiro, n\u00e3o tem v\u00edcios.", "INES_VIEIRA_BOGEA", None, "bres"],
313
- # ["Quando voc\u00ea n\u00e3o corre nenhum risco, voc\u00ea arrisca tudo.", "MARIA_ASSUNCAO_SOUSA", None, "brpi"]
314
- ],
315
- # Enable the weighted sampler
316
- use_weighted_sampler=True,
317
- # Ensures that all values of the sampled attribute (here, language) are seen equally in the training batches, no matter how many samples each one has
318
- # weighted_sampler_attrs={"language": 1.0, "speaker_name": 1.0},
319
- weighted_sampler_attrs={"language": 1.0},
320
- weighted_sampler_multipliers={
321
- # "speaker_name": {
322
- # You can force the batching scheme to give a higher weight to a certain speaker, so that this speaker appears more frequently in the batches.
323
- # This speeds up the speaker adaptation process. Consider the CML train dataset and "new_speaker" as the name of the speaker you want to adapt to.
324
- # The line below makes the balancer treat "new_speaker" as 106 speakers, i.e. 1/4 of the number of speakers in the CML dataset.
325
- # 'new_speaker': 106, # (CML tot. train speaker)/4 = (424/4) = 106
326
- # }
327
- },
328
- # Set the Speaker Consistency Loss (SCL) α to 9, as in the YourTTS paper
329
- speaker_encoder_loss_alpha=9.0,
330
- )
331
-
332
- # Load all the dataset samples and split training and evaluation sets
333
- train_samples, eval_samples = load_tts_samples(
334
- config.datasets,
335
- eval_split=True,
336
- eval_split_max_size=config.eval_split_max_size,
337
- eval_split_size=config.eval_split_size,
338
- )
339
-
340
- # Init the model
341
- model = Vits.init_from_config(config)
342
-
343
- # Init the trainer and 🚀
344
- trainer = Trainer(
345
- TrainerArgs(restore_path=RESTORE_PATH, skip_train_epoch=SKIP_TRAIN_EPOCH, start_with_eval=True),
346
- config,
347
- output_path=OUT_PATH,
348
- model=model,
349
- train_samples=train_samples,
350
- eval_samples=eval_samples,
351
- )
352
- trainer.fit()
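
The removed script above enables language-balanced sampling (use_weighted_sampler=True with weighted_sampler_attrs={"language": 1.0}). As a minimal, hypothetical sketch of the idea behind such balancing (not the Coqui-TTS internals; the sample dicts below are made up), per-sample weights can be taken as the inverse of each accent's frequency and fed to a WeightedRandomSampler:

# Minimal sketch of language-balanced sampling; not the Coqui-TTS implementation.
from collections import Counter

import torch
from torch.utils.data import WeightedRandomSampler

# Hypothetical samples; in the script above they come from load_tts_samples().
samples = [
    {"audio_file": "a.wav", "language": "brsp"},
    {"audio_file": "b.wav", "language": "brsp"},
    {"audio_file": "c.wav", "language": "brpb"},
]

counts = Counter(s["language"] for s in samples)
# Inverse-frequency weights: under-represented accents are drawn more often.
weights = torch.tensor([1.0 / counts[s["language"]] for s in samples], dtype=torch.double)
sampler = WeightedRandomSampler(weights, num_samples=len(samples), replacement=True)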
 
 
 
 
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/best_model.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a082ddde12d21020f66a70cf05a74826488d10008a8379b699458d92509e85d1
3
- size 1043216142
 
 
 
 
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/best_model_87192.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a082ddde12d21020f66a70cf05a74826488d10008a8379b699458d92509e85d1
3
- size 1043216142
 
 
 
 
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/checkpoint_130000.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:5a584eb832a857f9a11180b34a84b81117d8690ed1e5fa39e4ff711cf6ffd7f7
3
- size 1043220766
 
 
 
 
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/checkpoint_135000.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:367ac46477805942658a7a78e8cf473409537967f9382a46249a8d11521ed3f9
3
- size 1043220766
 
 
 
 
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/config.json DELETED
@@ -1,496 +0,0 @@
1
- {
2
- "output_path": "/raid/datasets/MUPE/Experiments/runs",
3
- "logger_uri": "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/",
4
- "run_name": "YourTTS-Baseline-PT",
5
- "project_name": "SYNTACC",
6
- "run_description": "\n - YourTTS with SYNTACC text encoder\n ",
7
- "print_step": 50,
8
- "plot_step": 100,
9
- "model_param_stats": false,
10
- "wandb_entity": null,
11
- "dashboard_logger": "clearml",
12
- "save_on_interrupt": true,
13
- "log_model_step": 1000,
14
- "save_step": 5000,
15
- "save_n_checkpoints": 2,
16
- "save_checkpoints": true,
17
- "save_all_best": false,
18
- "save_best_after": 10000,
19
- "target_loss": null,
20
- "print_eval": false,
21
- "test_delay_epochs": 0,
22
- "run_eval": true,
23
- "run_eval_steps": null,
24
- "distributed_backend": "nccl",
25
- "distributed_url": "tcp://localhost:54321",
26
- "mixed_precision": false,
27
- "precision": "fp16",
28
- "epochs": 1000,
29
- "batch_size": 26,
30
- "eval_batch_size": 26,
31
- "grad_clip": [
32
- 1000,
33
- 1000
34
- ],
35
- "scheduler_after_epoch": true,
36
- "lr": 0.001,
37
- "optimizer": "AdamW",
38
- "optimizer_params": {
39
- "betas": [
40
- 0.8,
41
- 0.99
42
- ],
43
- "eps": 1e-09,
44
- "weight_decay": 0.01
45
- },
46
- "lr_scheduler": null,
47
- "lr_scheduler_params": {},
48
- "use_grad_scaler": false,
49
- "allow_tf32": false,
50
- "cudnn_enable": true,
51
- "cudnn_deterministic": false,
52
- "cudnn_benchmark": false,
53
- "training_seed": 54321,
54
- "model": "vits",
55
- "num_loader_workers": 8,
56
- "num_eval_loader_workers": 0,
57
- "use_noise_augment": false,
58
- "audio": {
59
- "fft_size": 1024,
60
- "sample_rate": 16000,
61
- "win_length": 1024,
62
- "hop_length": 256,
63
- "num_mels": 80,
64
- "mel_fmin": 0.0,
65
- "mel_fmax": null
66
- },
67
- "use_phonemes": false,
68
- "phonemizer": "espeak",
69
- "phoneme_language": "en",
70
- "compute_input_seq_cache": true,
71
- "text_cleaner": "multilingual_cleaners",
72
- "enable_eos_bos_chars": false,
73
- "test_sentences_file": "",
74
- "phoneme_cache_path": null,
75
- "characters": {
76
- "characters_class": "TTS.tts.models.vits.VitsCharacters",
77
- "vocab_dict": null,
78
- "pad": "_",
79
- "eos": "&",
80
- "bos": "*",
81
- "blank": null,
82
- "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
83
- "punctuations": "\u2014!'(),-.:;?\u00bf ",
84
- "phonemes": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
85
- "is_unique": true,
86
- "is_sorted": true
87
- },
88
- "add_blank": true,
89
- "batch_group_size": 48,
90
- "loss_masking": null,
91
- "min_audio_len": 1,
92
- "max_audio_len": Infinity,
93
- "min_text_len": 1,
94
- "max_text_len": Infinity,
95
- "compute_f0": false,
96
- "compute_energy": false,
97
- "compute_linear_spec": true,
98
- "precompute_num_workers": 12,
99
- "start_by_longest": true,
100
- "shuffle": false,
101
- "drop_last": false,
102
- "datasets": [
103
- {
104
- "formatter": "coqui",
105
- "dataset_name": "mupe",
106
- "path": "/raid/datasets/MUPE/dataset/mupe/",
107
- "meta_file_train": "metadata_coqui_brpb.csv",
108
- "ignored_speakers": null,
109
- "language": "brpb",
110
- "phonemizer": "",
111
- "meta_file_val": "",
112
- "meta_file_attn_mask": ""
113
- },
114
- {
115
- "formatter": "coqui",
116
- "dataset_name": "mupe",
117
- "path": "/raid/datasets/MUPE/dataset/mupe/",
118
- "meta_file_train": "metadata_coqui_brba.csv",
119
- "ignored_speakers": null,
120
- "language": "brba",
121
- "phonemizer": "",
122
- "meta_file_val": "",
123
- "meta_file_attn_mask": ""
124
- },
125
- {
126
- "formatter": "coqui",
127
- "dataset_name": "mupe",
128
- "path": "/raid/datasets/MUPE/dataset/mupe/",
129
- "meta_file_train": "metadata_coqui_brportugal.csv",
130
- "ignored_speakers": null,
131
- "language": "brportugal",
132
- "phonemizer": "",
133
- "meta_file_val": "",
134
- "meta_file_attn_mask": ""
135
- },
136
- {
137
- "formatter": "coqui",
138
- "dataset_name": "mupe",
139
- "path": "/raid/datasets/MUPE/dataset/mupe/",
140
- "meta_file_train": "metadata_coqui_brsp.csv",
141
- "ignored_speakers": null,
142
- "language": "brsp",
143
- "phonemizer": "",
144
- "meta_file_val": "",
145
- "meta_file_attn_mask": ""
146
- },
147
- {
148
- "formatter": "coqui",
149
- "dataset_name": "mupe",
150
- "path": "/raid/datasets/MUPE/dataset/mupe/",
151
- "meta_file_train": "metadata_coqui_brpe.csv",
152
- "ignored_speakers": null,
153
- "language": "brpe",
154
- "phonemizer": "",
155
- "meta_file_val": "",
156
- "meta_file_attn_mask": ""
157
- },
158
- {
159
- "formatter": "coqui",
160
- "dataset_name": "mupe",
161
- "path": "/raid/datasets/MUPE/dataset/mupe/",
162
- "meta_file_train": "metadata_coqui_brmg.csv",
163
- "ignored_speakers": null,
164
- "language": "brmg",
165
- "phonemizer": "",
166
- "meta_file_val": "",
167
- "meta_file_attn_mask": ""
168
- },
169
- {
170
- "formatter": "coqui",
171
- "dataset_name": "mupe",
172
- "path": "/raid/datasets/MUPE/dataset/mupe/",
173
- "meta_file_train": "metadata_coqui_brrj.csv",
174
- "ignored_speakers": null,
175
- "language": "brrj",
176
- "phonemizer": "",
177
- "meta_file_val": "",
178
- "meta_file_attn_mask": ""
179
- },
180
- {
181
- "formatter": "coqui",
182
- "dataset_name": "mupe",
183
- "path": "/raid/datasets/MUPE/dataset/mupe/",
184
- "meta_file_train": "metadata_coqui_brce.csv",
185
- "ignored_speakers": null,
186
- "language": "brce",
187
- "phonemizer": "",
188
- "meta_file_val": "",
189
- "meta_file_attn_mask": ""
190
- },
191
- {
192
- "formatter": "coqui",
193
- "dataset_name": "mupe",
194
- "path": "/raid/datasets/MUPE/dataset/mupe/",
195
- "meta_file_train": "metadata_coqui_brrs.csv",
196
- "ignored_speakers": null,
197
- "language": "brrs",
198
- "phonemizer": "",
199
- "meta_file_val": "",
200
- "meta_file_attn_mask": ""
201
- },
202
- {
203
- "formatter": "coqui",
204
- "dataset_name": "mupe",
205
- "path": "/raid/datasets/MUPE/dataset/mupe/",
206
- "meta_file_train": "metadata_coqui_bralemanha.csv",
207
- "ignored_speakers": null,
208
- "language": "bralemanha",
209
- "phonemizer": "",
210
- "meta_file_val": "",
211
- "meta_file_attn_mask": ""
212
- },
213
- {
214
- "formatter": "coqui",
215
- "dataset_name": "mupe",
216
- "path": "/raid/datasets/MUPE/dataset/mupe/",
217
- "meta_file_train": "metadata_coqui_brgo.csv",
218
- "ignored_speakers": null,
219
- "language": "brgo",
220
- "phonemizer": "",
221
- "meta_file_val": "",
222
- "meta_file_attn_mask": ""
223
- },
224
- {
225
- "formatter": "coqui",
226
- "dataset_name": "mupe",
227
- "path": "/raid/datasets/MUPE/dataset/mupe/",
228
- "meta_file_train": "metadata_coqui_bral.csv",
229
- "ignored_speakers": null,
230
- "language": "bral",
231
- "phonemizer": "",
232
- "meta_file_val": "",
233
- "meta_file_attn_mask": ""
234
- },
235
- {
236
- "formatter": "coqui",
237
- "dataset_name": "mupe",
238
- "path": "/raid/datasets/MUPE/dataset/mupe/",
239
- "meta_file_train": "metadata_coqui_brpr.csv",
240
- "ignored_speakers": null,
241
- "language": "brpr",
242
- "phonemizer": "",
243
- "meta_file_val": "",
244
- "meta_file_attn_mask": ""
245
- }
246
- ],
247
- "test_sentences": [
248
- [
249
- "Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.",
250
- "EDILEINE_FONSECA",
251
- null,
252
- "brsp"
253
- ],
254
- [
255
- "Quem semeia ventos, colhe tempestades.",
256
- "JOSE_PAULO_DE_ARAUJO",
257
- null,
258
- "brpb"
259
- ],
260
- [
261
- "O olho do dono \u00e9 que engorda o gado.",
262
- "VITOR_RAFAEL_OLIVEIRA_ALVES",
263
- null,
264
- "brba"
265
- ],
266
- [
267
- "\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.",
268
- "MARIA_AURORA_FELIX",
269
- null,
270
- "brportugal"
271
- ],
272
- [
273
- "Quem espera sempre alcan\u00e7a.",
274
- "ANTONIO_DE_AMORIM_COSTA",
275
- null,
276
- "brpe"
277
- ],
278
- [
279
- "Cada macaco no seu galho.",
280
- "ALCIDES_DE_LIMA",
281
- null,
282
- "brmg"
283
- ],
284
- [
285
- "Em terra de cego, quem tem um olho \u00e9 rei.",
286
- "ALUISIO_SOARES_DE_SOUSA",
287
- null,
288
- "brrj"
289
- ],
290
- [
291
- "A ocasi\u00e3o faz o ladr\u00e3o.",
292
- "FRANCISCO_JOSE_MOREIRA_MOTA",
293
- null,
294
- "brce"
295
- ],
296
- [
297
- "De gr\u00e3o em gr\u00e3o, a galinha enche o papo.",
298
- "EVALDO_ANDRADA_CORREA",
299
- null,
300
- "brrs"
301
- ],
302
- [
303
- "Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.",
304
- "DORIS_ALEXANDER",
305
- null,
306
- "bralemanha"
307
- ],
308
- [
309
- "Quem n\u00e3o arrisca, n\u00e3o petisca.",
310
- "DONALDO_LUIZ_DE_ALMEIDA",
311
- null,
312
- "brgo"
313
- ],
314
- [
315
- "A uni\u00e3o faz a for\u00e7a.",
316
- "GERONCIO_HENRIQUE_NETO",
317
- null,
318
- "bral"
319
- ],
320
- [
321
- "Em boca fechada n\u00e3o entra mosquito.",
322
- "MALU_NATEL_FREIRE_WEBER",
323
- null,
324
- "brpr"
325
- ]
326
- ],
327
- "eval_split_max_size": 256,
328
- "eval_split_size": 0.01,
329
- "use_speaker_weighted_sampler": false,
330
- "speaker_weighted_sampler_alpha": 1.0,
331
- "use_language_weighted_sampler": false,
332
- "language_weighted_sampler_alpha": 1.0,
333
- "use_length_weighted_sampler": false,
334
- "length_weighted_sampler_alpha": 1.0,
335
- "model_args": {
336
- "num_chars": 266,
337
- "out_channels": 513,
338
- "spec_segment_size": 62,
339
- "hidden_channels": 192,
340
- "use_adaptive_weight_text_encoder": false,
341
- "use_perfect_class_batch_sampler": true,
342
- "perfect_class_batch_sampler_key": "language",
343
- "hidden_channels_ffn_text_encoder": 768,
344
- "num_heads_text_encoder": 2,
345
- "num_layers_text_encoder": 10,
346
- "kernel_size_text_encoder": 3,
347
- "dropout_p_text_encoder": 0.1,
348
- "dropout_p_duration_predictor": 0.5,
349
- "kernel_size_posterior_encoder": 5,
350
- "dilation_rate_posterior_encoder": 1,
351
- "num_layers_posterior_encoder": 16,
352
- "kernel_size_flow": 5,
353
- "dilation_rate_flow": 1,
354
- "num_layers_flow": 4,
355
- "resblock_type_decoder": "2",
356
- "resblock_kernel_sizes_decoder": [
357
- 3,
358
- 7,
359
- 11
360
- ],
361
- "resblock_dilation_sizes_decoder": [
362
- [
363
- 1,
364
- 3,
365
- 5
366
- ],
367
- [
368
- 1,
369
- 3,
370
- 5
371
- ],
372
- [
373
- 1,
374
- 3,
375
- 5
376
- ]
377
- ],
378
- "upsample_rates_decoder": [
379
- 8,
380
- 8,
381
- 2,
382
- 2
383
- ],
384
- "upsample_initial_channel_decoder": 512,
385
- "upsample_kernel_sizes_decoder": [
386
- 16,
387
- 16,
388
- 4,
389
- 4
390
- ],
391
- "periods_multi_period_discriminator": [
392
- 2,
393
- 3,
394
- 5,
395
- 7,
396
- 11
397
- ],
398
- "use_sdp": true,
399
- "noise_scale": 1.0,
400
- "inference_noise_scale": 0.667,
401
- "length_scale": 1,
402
- "noise_scale_dp": 1.0,
403
- "inference_noise_scale_dp": 1.0,
404
- "max_inference_len": null,
405
- "init_discriminator": true,
406
- "use_spectral_norm_disriminator": false,
407
- "use_speaker_embedding": false,
408
- "num_speakers": 0,
409
- "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/speakers.pth",
410
- "d_vector_file": [
411
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
412
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
413
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
414
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
415
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
416
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
417
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
418
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
419
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
420
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
421
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
422
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
423
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
424
- ],
425
- "speaker_embedding_channels": 256,
426
- "use_d_vector_file": true,
427
- "d_vector_dim": 512,
428
- "detach_dp_input": true,
429
- "use_language_embedding": true,
430
- "embedded_language_dim": 4,
431
- "num_languages": 0,
432
- "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/language_ids.json",
433
- "use_speaker_encoder_as_loss": false,
434
- "speaker_encoder_config_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
435
- "speaker_encoder_model_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
436
- "condition_dp_on_speaker": true,
437
- "freeze_encoder": false,
438
- "freeze_DP": false,
439
- "freeze_PE": false,
440
- "freeze_flow_decoder": false,
441
- "freeze_waveform_decoder": false,
442
- "encoder_sample_rate": null,
443
- "interpolate_z": true,
444
- "reinit_DP": false,
445
- "reinit_text_encoder": false
446
- },
447
- "lr_gen": 0.0002,
448
- "lr_disc": 0.0002,
449
- "lr_scheduler_gen": "ExponentialLR",
450
- "lr_scheduler_gen_params": {
451
- "gamma": 0.999875,
452
- "last_epoch": -1
453
- },
454
- "lr_scheduler_disc": "ExponentialLR",
455
- "lr_scheduler_disc_params": {
456
- "gamma": 0.999875,
457
- "last_epoch": -1
458
- },
459
- "kl_loss_alpha": 1.0,
460
- "disc_loss_alpha": 1.0,
461
- "gen_loss_alpha": 1.0,
462
- "feat_loss_alpha": 1.0,
463
- "mel_loss_alpha": 45.0,
464
- "dur_loss_alpha": 1.0,
465
- "speaker_encoder_loss_alpha": 9.0,
466
- "return_wav": true,
467
- "use_weighted_sampler": true,
468
- "weighted_sampler_attrs": {
469
- "language": 1.0
470
- },
471
- "weighted_sampler_multipliers": {},
472
- "r": 1,
473
- "num_speakers": 0,
474
- "use_speaker_embedding": false,
475
- "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/speakers.pth",
476
- "speaker_embedding_channels": 256,
477
- "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/language_ids.json",
478
- "use_language_embedding": true,
479
- "use_d_vector_file": true,
480
- "d_vector_file": [
481
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
482
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
483
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
484
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
485
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
486
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
487
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
488
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
489
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
490
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
491
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
492
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
493
- "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
494
- ],
495
- "d_vector_dim": 512
496
- }
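
The config above points to 13 per-accent d-vector files (H_ASP_speaker_embeddings_*.pth). A quick way to sanity-check one of them is sketched below; it assumes the usual Coqui-TTS d-vector layout of a dict mapping an utterance key to {"name": speaker, "embedding": 512-dim vector}, which is an assumption rather than something stated in this commit.

# Hedged sketch for inspecting a per-accent d-vector file; the dict layout is an assumption.
import torch

d_vectors = torch.load(
    "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
    map_location="cpu",
)
speakers = sorted({entry["name"] for entry in d_vectors.values()})
print(f"{len(d_vectors)} embeddings covering {len(speakers)} speakers")
print("embedding dim:", len(next(iter(d_vectors.values()))["embedding"]))  # expected 512 (d_vector_dim)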
 
 
 
 
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/language_ids.json DELETED
@@ -1,15 +0,0 @@
1
- {
2
- "bral": 0,
3
- "bralemanha": 1,
4
- "brba": 2,
5
- "brce": 3,
6
- "brgo": 4,
7
- "brmg": 5,
8
- "brpb": 6,
9
- "brpe": 7,
10
- "brportugal": 8,
11
- "brpr": 9,
12
- "brrj": 10,
13
- "brrs": 11,
14
- "brsp": 12
15
- }
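
The mapping above assigns an integer id to each of the 13 accents. With use_language_embedding=True and embedded_language_dim=4 in the config, each id indexes a learned 4-dimensional embedding that conditions the model on the accent. An illustrative sketch of that lookup (mirroring the idea, not the exact VITS module):

# Illustrative sketch of the accent-id -> 4-dim language embedding lookup.
import json

import torch
import torch.nn as nn

with open("language_ids.json") as f:  # the mapping shown above
    language_ids = json.load(f)       # e.g. {"bral": 0, ..., "brsp": 12}

lang_emb = nn.Embedding(num_embeddings=len(language_ids), embedding_dim=4)
lid = torch.tensor([language_ids["brsp"]])
print(lang_emb(lid).shape)  # torch.Size([1, 4]), used to condition the synthesis on the accent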
 
 
 
 
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/speakers.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d0b8d8013199105bfba41bbef0ac6c7fc44ecb3385a39980da80931496c039bf
3
- size 3296
 
 
 
 
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/trainer_0_log.txt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:5ddf81cb4061c7e47bd824c3ebb109cc02bc31ab79ee21e4e69d60d32aca454b
3
- size 1794644
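
Most file bodies in this commit are Git LFS pointer stanzas (version / oid sha256 / size) rather than the binary payloads. If you ever need to read one programmatically, a minimal sketch is shown below; parse_lfs_pointer is a hypothetical helper, not part of any library.

# Minimal sketch: parse a Git LFS pointer stanza like the three-line blocks above.
def parse_lfs_pointer(text: str) -> dict:
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:5ddf81cb4061c7e47bd824c3ebb109cc02bc31ab79ee21e4e69d60d32aca454b
size 1794644"""
info = parse_lfs_pointer(pointer)
print(info["oid"], int(info["size"]))  # content hash and payload size in bytes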
 
 
 
 
Experiments/runs/YourTTS-Syntacc-PT_continue-January-28-2024_02+26PM-8a499b88c/{checkpoint_185000.pth β†’ checkpoint_195000.pth} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:00fabae247abd9845b02ea35b314b4aab9714d3a2a63948b160c115008dc96da
3
  size 1044066458
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c552bdeff67502deab77d3f587269e090fac00dc991bcfba8dedfa21594d471
3
  size 1044066458
Experiments/runs/YourTTS-Syntacc-PT_continue-January-28-2024_02+26PM-8a499b88c/trainer_0_log.txt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:433a8e8d660ca8de05748c6b575c29657381d5c2c504b073249e9f2cb833c25f
3
- size 3244264
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:327601981f984533599c289f977acc81f9d7479999f14235302e6ad1a171d710
3
+ size 3401880
Experiments/train_syntacc_baseline.py CHANGED
@@ -28,7 +28,7 @@ RUN_NAME = "YourTTS-Baseline-PT"
28
  OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs") # "/raid/coqui/Checkpoints/original-YourTTS/"
29
 
30
  # If you want to do transfer learning and speed up your training, you can set here the path to the available CML-TTS checkpoint, which can be downloaded here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
31
- RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_85000.pth" # Download the checkpoint here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
32
 
33
  # This parameter is useful for debugging: it skips the training epochs and just does the evaluation and produces the test sentences
34
  SKIP_TRAIN_EPOCH = False
 
28
  OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs") # "/raid/coqui/Checkpoints/original-YourTTS/"
29
 
30
  # If you want to do transfer learning and speed up your training, you can set here the path to the available CML-TTS checkpoint, which can be downloaded here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
31
+ RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT_continue-January-28-2024_02+26PM-8a499b88c/checkpoint_195000.pth" # Download the checkpoint here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
32
 
33
  # This parameter is useful for debugging: it skips the training epochs and just does the evaluation and produces the test sentences
34
  SKIP_TRAIN_EPOCH = False
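
The RESTORE_PATH change above resumes training from the checkpoint_195000.pth SYNTACC checkpoint. Before resuming, it can be worth confirming which step a checkpoint actually holds; the sketch below assumes the Coqui Trainer checkpoint dict exposes "step" and "model" entries, which is an assumption about the checkpoint layout rather than something documented in this commit.

# Hedged sketch: confirm which training step a checkpoint holds before setting RESTORE_PATH.
# Assumes the checkpoint dict stores "step" and "model"; adjust if the layout differs.
import torch

ckpt_path = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT_continue-January-28-2024_02+26PM-8a499b88c/checkpoint_195000.pth"
ckpt = torch.load(ckpt_path, map_location="cpu")
print("keys:", sorted(ckpt.keys()))
print("step:", ckpt.get("step"))             # expected: 195000 for this checkpoint
print("model params:", len(ckpt.get("model", {})))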