Update
Browse files- Experiments/nohup.out +2 -2
- Experiments/run/events.out.tfevents.1706462806.edresson-train-80.145564.0 +2 -2
- Experiments/{runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/trainer_0_log.txt β run/events.out.tfevents.1706899297.edresson-train-80-3.1052.0} +2 -2
- Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9 β YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/best_model.pth +2 -2
- Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/best_model_85001.pth β YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a/best_model_195001.pth} +2 -2
- Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05 β YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/config.json +6 -6
- Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05 β YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/language_ids.json +0 -0
- Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05 β YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/speakers.pth +0 -0
- Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116 β YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/train_syntacc_baseline.py +3 -1
- Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9 β YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/trainer_0_log.txt +2 -2
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/best_model.pth +0 -3
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/best_model_124752.pth +0 -3
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/checkpoint_130000.pth +0 -3
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/checkpoint_135000.pth +0 -3
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/train_syntacc_baseline.py +0 -352
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/config.json +0 -496
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/language_ids.json +0 -15
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/speakers.pth +0 -3
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/train_syntacc_baseline.py +0 -352
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/best_model.pth +0 -3
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/best_model_87192.pth +0 -3
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/checkpoint_130000.pth +0 -3
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/checkpoint_135000.pth +0 -3
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/config.json +0 -496
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/language_ids.json +0 -15
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/speakers.pth +0 -3
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/trainer_0_log.txt +0 -3
- Experiments/runs/YourTTS-Syntacc-PT_continue-January-28-2024_02+26PM-8a499b88c/{checkpoint_185000.pth β checkpoint_195000.pth} +1 -1
- Experiments/runs/YourTTS-Syntacc-PT_continue-January-28-2024_02+26PM-8a499b88c/trainer_0_log.txt +2 -2
- Experiments/train_syntacc_baseline.py +1 -1
Experiments/nohup.out
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4f3ff491be1a22770ad6be06a4ab637e3ee1fdd7ab56a46d56b6ee5ce294191a
|
3 |
+
size 19098782
|
Experiments/run/events.out.tfevents.1706462806.edresson-train-80.145564.0
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:edf473f639006f00be06083dcda982e19ad249445299bba3ccfa9d3c3be668c9
|
3 |
+
size 603478571
|
Experiments/{runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/trainer_0_log.txt β run/events.out.tfevents.1706899297.edresson-train-80-3.1052.0}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f92cb9921885f7784782d7c4cf4983bd9ebf92511857b363ad6c4a213d77e7fb
|
3 |
+
size 1426573
|
Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9 β YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/best_model.pth
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5a8ca0385eb8c2d74471a308ead9447f46334969a793ff980a527783b55f6571
|
3 |
+
size 347720178
|
Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/best_model_85001.pth β YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a/best_model_195001.pth}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5a8ca0385eb8c2d74471a308ead9447f46334969a793ff980a527783b55f6571
|
3 |
+
size 347720178
|
Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05 β YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/config.json
RENAMED
@@ -397,16 +397,16 @@
|
|
397 |
],
|
398 |
"use_sdp": true,
|
399 |
"noise_scale": 1.0,
|
400 |
-
"inference_noise_scale": 0.
|
401 |
"length_scale": 1,
|
402 |
"noise_scale_dp": 1.0,
|
403 |
-
"inference_noise_scale_dp":
|
404 |
"max_inference_len": null,
|
405 |
"init_discriminator": true,
|
406 |
"use_spectral_norm_disriminator": false,
|
407 |
"use_speaker_embedding": false,
|
408 |
"num_speakers": 0,
|
409 |
-
"speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-
|
410 |
"d_vector_file": [
|
411 |
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
|
412 |
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
|
@@ -429,7 +429,7 @@
|
|
429 |
"use_language_embedding": true,
|
430 |
"embedded_language_dim": 4,
|
431 |
"num_languages": 0,
|
432 |
-
"language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-
|
433 |
"use_speaker_encoder_as_loss": false,
|
434 |
"speaker_encoder_config_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
|
435 |
"speaker_encoder_model_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
|
@@ -472,9 +472,9 @@
|
|
472 |
"r": 1,
|
473 |
"num_speakers": 0,
|
474 |
"use_speaker_embedding": false,
|
475 |
-
"speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-
|
476 |
"speaker_embedding_channels": 256,
|
477 |
-
"language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-
|
478 |
"use_language_embedding": true,
|
479 |
"use_d_vector_file": true,
|
480 |
"d_vector_file": [
|
|
|
397 |
],
|
398 |
"use_sdp": true,
|
399 |
"noise_scale": 1.0,
|
400 |
+
"inference_noise_scale": 0.33,
|
401 |
"length_scale": 1,
|
402 |
"noise_scale_dp": 1.0,
|
403 |
+
"inference_noise_scale_dp": 0.33,
|
404 |
"max_inference_len": null,
|
405 |
"init_discriminator": true,
|
406 |
"use_spectral_norm_disriminator": false,
|
407 |
"use_speaker_embedding": false,
|
408 |
"num_speakers": 0,
|
409 |
+
"speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a/speakers.pth",
|
410 |
"d_vector_file": [
|
411 |
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
|
412 |
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
|
|
|
429 |
"use_language_embedding": true,
|
430 |
"embedded_language_dim": 4,
|
431 |
"num_languages": 0,
|
432 |
+
"language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a/language_ids.json",
|
433 |
"use_speaker_encoder_as_loss": false,
|
434 |
"speaker_encoder_config_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
|
435 |
"speaker_encoder_model_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
|
|
|
472 |
"r": 1,
|
473 |
"num_speakers": 0,
|
474 |
"use_speaker_embedding": false,
|
475 |
+
"speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a/speakers.pth",
|
476 |
"speaker_embedding_channels": 256,
|
477 |
+
"language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a/language_ids.json",
|
478 |
"use_language_embedding": true,
|
479 |
"use_d_vector_file": true,
|
480 |
"d_vector_file": [
|
Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05 β YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/language_ids.json
RENAMED
File without changes
|
Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05 β YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/speakers.pth
RENAMED
File without changes
|
Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116 β YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/train_syntacc_baseline.py
RENAMED
@@ -28,7 +28,7 @@ RUN_NAME = "YourTTS-Baseline-PT"
|
|
28 |
OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs") # "/raid/coqui/Checkpoints/original-YourTTS/"
|
29 |
|
30 |
# If you want to do transfer learning and speedup your training you can set here the path to the CML-TTS available checkpoint that cam be downloaded here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
|
31 |
-
RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-
|
32 |
|
33 |
# This paramter is useful to debug, it skips the training epochs and just do the evaluation and produce the test sentences
|
34 |
SKIP_TRAIN_EPOCH = False
|
@@ -221,6 +221,8 @@ audio_config = VitsAudioConfig(
|
|
221 |
|
222 |
# Init VITSArgs setting the arguments that are needed for the YourTTS model
|
223 |
model_args = VitsArgs(
|
|
|
|
|
224 |
spec_segment_size=62,
|
225 |
hidden_channels=192,
|
226 |
hidden_channels_ffn_text_encoder=768,
|
|
|
28 |
OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs") # "/raid/coqui/Checkpoints/original-YourTTS/"
|
29 |
|
30 |
# If you want to do transfer learning and speedup your training you can set here the path to the CML-TTS available checkpoint that cam be downloaded here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
|
31 |
+
RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT_continue-January-28-2024_02+26PM-8a499b88c/checkpoint_195000.pth" # Download the checkpoint here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
|
32 |
|
33 |
# This paramter is useful to debug, it skips the training epochs and just do the evaluation and produce the test sentences
|
34 |
SKIP_TRAIN_EPOCH = False
|
|
|
221 |
|
222 |
# Init VITSArgs setting the arguments that are needed for the YourTTS model
|
223 |
model_args = VitsArgs(
|
224 |
+
inference_noise_scale=0.33,
|
225 |
+
inference_noise_scale_dp=0.33,
|
226 |
spec_segment_size=62,
|
227 |
hidden_channels=192,
|
228 |
hidden_channels_ffn_text_encoder=768,
|
Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9 β YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/trainer_0_log.txt
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:800fa1ba79843ee3494b41dbc8ffa45c6f147a7eb369e72260cbc0a5ce75dd72
|
3 |
+
size 135592
|
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/best_model.pth
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:c62e29c7a1dd4f701ab4998e0b1f569cfe7486cc7806f149c1ff857f172383e0
|
3 |
-
size 1043220702
|
|
|
|
|
|
|
|
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/best_model_124752.pth
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:c62e29c7a1dd4f701ab4998e0b1f569cfe7486cc7806f149c1ff857f172383e0
|
3 |
-
size 1043220702
|
|
|
|
|
|
|
|
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/checkpoint_130000.pth
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:a71ead47e605fc525b264ad882fd54630c15a42eb69aaf88993d26d5ea84ae3b
|
3 |
-
size 1043220766
|
|
|
|
|
|
|
|
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/checkpoint_135000.pth
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:96e16ee83729813041c17f6edf8a702bdf59e7afe345cfad1fe65dd4ba0b1fce
|
3 |
-
size 1043220766
|
|
|
|
|
|
|
|
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/train_syntacc_baseline.py
DELETED
@@ -1,352 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
|
3 |
-
import torch
|
4 |
-
from trainer import Trainer, TrainerArgs
|
5 |
-
|
6 |
-
from TTS.bin.compute_embeddings import compute_embeddings
|
7 |
-
from TTS.bin.resample import resample_files
|
8 |
-
from TTS.config.shared_configs import BaseDatasetConfig
|
9 |
-
from TTS.tts.configs.vits_config import VitsConfig
|
10 |
-
from TTS.tts.datasets import load_tts_samples
|
11 |
-
from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig, VitsDataset
|
12 |
-
from TTS.utils.downloaders import download_libri_tts
|
13 |
-
from torch.utils.data import DataLoader
|
14 |
-
from TTS.utils.samplers import PerfectBatchSampler
|
15 |
-
torch.set_num_threads(24)
|
16 |
-
|
17 |
-
# pylint: disable=W0105
|
18 |
-
"""
|
19 |
-
This recipe replicates the first experiment proposed in the CML-TTS paper (https://arxiv.org/abs/2306.10097). It uses the YourTTS model.
|
20 |
-
YourTTS model is based on the VITS model however it uses external speaker embeddings extracted from a pre-trained speaker encoder and has small architecture changes.
|
21 |
-
"""
|
22 |
-
CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
|
23 |
-
|
24 |
-
# Name of the run for the Trainer
|
25 |
-
RUN_NAME = "YourTTS-Baseline-PT"
|
26 |
-
|
27 |
-
# Path where you want to save the models outputs (configs, checkpoints and tensorboard logs)
|
28 |
-
OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs") # "/raid/coqui/Checkpoints/original-YourTTS/"
|
29 |
-
|
30 |
-
# If you want to do transfer learning and speedup your training you can set here the path to the CML-TTS available checkpoint that cam be downloaded here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
|
31 |
-
RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_85000.pth" # Download the checkpoint here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
|
32 |
-
|
33 |
-
# This paramter is useful to debug, it skips the training epochs and just do the evaluation and produce the test sentences
|
34 |
-
SKIP_TRAIN_EPOCH = False
|
35 |
-
|
36 |
-
# Set here the batch size to be used in training and evaluation
|
37 |
-
BATCH_SIZE = 26
|
38 |
-
|
39 |
-
# Training Sampling rate and the target sampling rate for resampling the downloaded dataset (Note: If you change this you might need to redownload the dataset !!)
|
40 |
-
# Note: If you add new datasets, please make sure that the dataset sampling rate and this parameter are matching, otherwise resample your audios
|
41 |
-
SAMPLE_RATE = 16000
|
42 |
-
|
43 |
-
|
44 |
-
DASHBOARD_LOGGER="tensorboard"
|
45 |
-
LOGGER_URI = None
|
46 |
-
|
47 |
-
DASHBOARD_LOGGER = "clearml"
|
48 |
-
LOGGER_URI = "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/"
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
# Max audio length in seconds to be used in training (every audio bigger than it will be ignored)
|
53 |
-
MAX_AUDIO_LEN_IN_SECONDS = float("inf")
|
54 |
-
|
55 |
-
# Define here the datasets config
|
56 |
-
brpb_train_config = BaseDatasetConfig(
|
57 |
-
formatter="coqui",
|
58 |
-
dataset_name="mupe",
|
59 |
-
meta_file_train="metadata_coqui_brpb.csv",
|
60 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
61 |
-
language="brpb"
|
62 |
-
)
|
63 |
-
|
64 |
-
brba_train_config = BaseDatasetConfig(
|
65 |
-
formatter="coqui",
|
66 |
-
dataset_name="mupe",
|
67 |
-
meta_file_train="metadata_coqui_brba.csv",
|
68 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
69 |
-
language="brba"
|
70 |
-
)
|
71 |
-
|
72 |
-
brportugal_train_config = BaseDatasetConfig(
|
73 |
-
formatter="coqui",
|
74 |
-
dataset_name="mupe",
|
75 |
-
meta_file_train="metadata_coqui_brportugal.csv",
|
76 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
77 |
-
language="brportugal"
|
78 |
-
)
|
79 |
-
|
80 |
-
brsp_train_config = BaseDatasetConfig(
|
81 |
-
formatter="coqui",
|
82 |
-
dataset_name="mupe",
|
83 |
-
meta_file_train="metadata_coqui_brsp.csv",
|
84 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
85 |
-
language="brsp"
|
86 |
-
)
|
87 |
-
|
88 |
-
brpe_train_config = BaseDatasetConfig(
|
89 |
-
formatter="coqui",
|
90 |
-
dataset_name="mupe",
|
91 |
-
meta_file_train="metadata_coqui_brpe.csv",
|
92 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
93 |
-
language="brpe"
|
94 |
-
)
|
95 |
-
|
96 |
-
brmg_train_config = BaseDatasetConfig(
|
97 |
-
formatter="coqui",
|
98 |
-
dataset_name="mupe",
|
99 |
-
meta_file_train="metadata_coqui_brmg.csv",
|
100 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
101 |
-
language="brmg"
|
102 |
-
)
|
103 |
-
|
104 |
-
brrj_train_config = BaseDatasetConfig(
|
105 |
-
formatter="coqui",
|
106 |
-
dataset_name="mupe",
|
107 |
-
meta_file_train="metadata_coqui_brrj.csv",
|
108 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
109 |
-
language="brrj"
|
110 |
-
)
|
111 |
-
|
112 |
-
brce_train_config = BaseDatasetConfig(
|
113 |
-
formatter="coqui",
|
114 |
-
dataset_name="mupe",
|
115 |
-
meta_file_train="metadata_coqui_brce.csv",
|
116 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
117 |
-
language="brce"
|
118 |
-
)
|
119 |
-
|
120 |
-
brrs_train_config = BaseDatasetConfig(
|
121 |
-
formatter="coqui",
|
122 |
-
dataset_name="mupe",
|
123 |
-
meta_file_train="metadata_coqui_brrs.csv",
|
124 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
125 |
-
language="brrs"
|
126 |
-
)
|
127 |
-
|
128 |
-
bralemanha_train_config = BaseDatasetConfig(
|
129 |
-
formatter="coqui",
|
130 |
-
dataset_name="mupe",
|
131 |
-
meta_file_train="metadata_coqui_bralemanha.csv",
|
132 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
133 |
-
language="bralemanha"
|
134 |
-
)
|
135 |
-
|
136 |
-
brgo_train_config = BaseDatasetConfig(
|
137 |
-
formatter="coqui",
|
138 |
-
dataset_name="mupe",
|
139 |
-
meta_file_train="metadata_coqui_brgo.csv",
|
140 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
141 |
-
language="brgo"
|
142 |
-
)
|
143 |
-
|
144 |
-
bral_train_config = BaseDatasetConfig(
|
145 |
-
formatter="coqui",
|
146 |
-
dataset_name="mupe",
|
147 |
-
meta_file_train="metadata_coqui_bral.csv",
|
148 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
149 |
-
language="bral"
|
150 |
-
)
|
151 |
-
|
152 |
-
brpr_train_config = BaseDatasetConfig(
|
153 |
-
formatter="coqui",
|
154 |
-
dataset_name="mupe",
|
155 |
-
meta_file_train="metadata_coqui_brpr.csv",
|
156 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
157 |
-
language="brpr"
|
158 |
-
)
|
159 |
-
|
160 |
-
bres_train_config = BaseDatasetConfig(
|
161 |
-
formatter="coqui",
|
162 |
-
dataset_name="mupe",
|
163 |
-
meta_file_train="metadata_coqui_bres.csv",
|
164 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
165 |
-
language="bres"
|
166 |
-
)
|
167 |
-
|
168 |
-
brpi_train_config = BaseDatasetConfig(
|
169 |
-
formatter="coqui",
|
170 |
-
dataset_name="mupe",
|
171 |
-
meta_file_train="metadata_coqui_brpi.csv",
|
172 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
173 |
-
language="brpi"
|
174 |
-
)
|
175 |
-
|
176 |
-
# bres_train_config, brpi_train_config no files found
|
177 |
-
DATASETS_CONFIG_LIST = [brpb_train_config,brba_train_config,brportugal_train_config,brsp_train_config,brpe_train_config,brmg_train_config,brrj_train_config,brce_train_config,brrs_train_config,bralemanha_train_config,brgo_train_config,bral_train_config,brpr_train_config]
|
178 |
-
|
179 |
-
|
180 |
-
### Extract speaker embeddings
|
181 |
-
SPEAKER_ENCODER_CHECKPOINT_PATH = (
|
182 |
-
"https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
|
183 |
-
)
|
184 |
-
SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"
|
185 |
-
|
186 |
-
D_VECTOR_FILES = [] # List of speaker embeddings/d-vectors to be used during the training
|
187 |
-
|
188 |
-
# Iterates all the dataset configs checking if the speakers embeddings are already computated, if not compute it
|
189 |
-
for dataset_conf in DATASETS_CONFIG_LIST:
|
190 |
-
# Check if the embeddings weren't already computed, if not compute it
|
191 |
-
embeddings_file = os.path.join(dataset_conf.path, f"H_ASP_speaker_embeddings_{dataset_conf.language}.pth")
|
192 |
-
if not os.path.isfile(embeddings_file):
|
193 |
-
print(f">>> Computing the speaker embeddings for the {dataset_conf.dataset_name} dataset")
|
194 |
-
compute_embeddings(
|
195 |
-
SPEAKER_ENCODER_CHECKPOINT_PATH,
|
196 |
-
SPEAKER_ENCODER_CONFIG_PATH,
|
197 |
-
embeddings_file,
|
198 |
-
old_speakers_file=None,
|
199 |
-
config_dataset_path=None,
|
200 |
-
formatter_name=dataset_conf.formatter,
|
201 |
-
dataset_name=dataset_conf.dataset_name,
|
202 |
-
dataset_path=dataset_conf.path,
|
203 |
-
meta_file_train=dataset_conf.meta_file_train,
|
204 |
-
meta_file_val=dataset_conf.meta_file_val,
|
205 |
-
disable_cuda=False,
|
206 |
-
no_eval=False,
|
207 |
-
)
|
208 |
-
D_VECTOR_FILES.append(embeddings_file)
|
209 |
-
|
210 |
-
|
211 |
-
# Audio config used in training.
|
212 |
-
audio_config = VitsAudioConfig(
|
213 |
-
sample_rate=SAMPLE_RATE,
|
214 |
-
hop_length=256,
|
215 |
-
win_length=1024,
|
216 |
-
fft_size=1024,
|
217 |
-
mel_fmin=0.0,
|
218 |
-
mel_fmax=None,
|
219 |
-
num_mels=80,
|
220 |
-
)
|
221 |
-
|
222 |
-
# Init VITSArgs setting the arguments that are needed for the YourTTS model
|
223 |
-
model_args = VitsArgs(
|
224 |
-
spec_segment_size=62,
|
225 |
-
hidden_channels=192,
|
226 |
-
hidden_channels_ffn_text_encoder=768,
|
227 |
-
num_heads_text_encoder=2,
|
228 |
-
num_layers_text_encoder=10,
|
229 |
-
kernel_size_text_encoder=3,
|
230 |
-
dropout_p_text_encoder=0.1,
|
231 |
-
d_vector_file=D_VECTOR_FILES,
|
232 |
-
use_d_vector_file=True,
|
233 |
-
d_vector_dim=512,
|
234 |
-
speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
|
235 |
-
speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
|
236 |
-
resblock_type_decoder="2", # In the paper, we accidentally trained the YourTTS using ResNet blocks type 2, if you like you can use the ResNet blocks type 1 like the VITS model
|
237 |
-
# Useful parameters to enable the Speaker Consistency Loss (SCL) described in the paper
|
238 |
-
use_speaker_encoder_as_loss=False,
|
239 |
-
# Useful parameters to enable multilingual training
|
240 |
-
use_language_embedding=True,
|
241 |
-
embedded_language_dim=4,
|
242 |
-
use_adaptive_weight_text_encoder=False,
|
243 |
-
use_perfect_class_batch_sampler=True,
|
244 |
-
perfect_class_batch_sampler_key="language"
|
245 |
-
)
|
246 |
-
|
247 |
-
# General training config, here you can change the batch size and others useful parameters
|
248 |
-
config = VitsConfig(
|
249 |
-
output_path=OUT_PATH,
|
250 |
-
model_args=model_args,
|
251 |
-
run_name=RUN_NAME,
|
252 |
-
project_name="SYNTACC",
|
253 |
-
run_description="""
|
254 |
-
- YourTTS with SYNTACC text encoder
|
255 |
-
""",
|
256 |
-
dashboard_logger=DASHBOARD_LOGGER,
|
257 |
-
logger_uri=LOGGER_URI,
|
258 |
-
audio=audio_config,
|
259 |
-
batch_size=BATCH_SIZE,
|
260 |
-
batch_group_size=48,
|
261 |
-
eval_batch_size=BATCH_SIZE,
|
262 |
-
num_loader_workers=8,
|
263 |
-
eval_split_max_size=256,
|
264 |
-
print_step=50,
|
265 |
-
plot_step=100,
|
266 |
-
log_model_step=1000,
|
267 |
-
save_step=5000,
|
268 |
-
save_n_checkpoints=2,
|
269 |
-
save_checkpoints=True,
|
270 |
-
# target_loss="loss_1",
|
271 |
-
print_eval=False,
|
272 |
-
use_phonemes=False,
|
273 |
-
phonemizer="espeak",
|
274 |
-
phoneme_language="en",
|
275 |
-
compute_input_seq_cache=True,
|
276 |
-
add_blank=True,
|
277 |
-
text_cleaner="multilingual_cleaners",
|
278 |
-
characters=CharactersConfig(
|
279 |
-
characters_class="TTS.tts.models.vits.VitsCharacters",
|
280 |
-
pad="_",
|
281 |
-
eos="&",
|
282 |
-
bos="*",
|
283 |
-
blank=None,
|
284 |
-
characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
|
285 |
-
punctuations="\u2014!'(),-.:;?\u00bf ",
|
286 |
-
phonemes="iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
|
287 |
-
is_unique=True,
|
288 |
-
is_sorted=True,
|
289 |
-
),
|
290 |
-
phoneme_cache_path=None,
|
291 |
-
precompute_num_workers=12,
|
292 |
-
start_by_longest=True,
|
293 |
-
datasets=DATASETS_CONFIG_LIST,
|
294 |
-
cudnn_benchmark=False,
|
295 |
-
max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
|
296 |
-
mixed_precision=False,
|
297 |
-
test_sentences=[
|
298 |
-
#GUSTAVO: apenas pessoas do treino
|
299 |
-
["Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.", "EDILEINE_FONSECA", None, "brsp"],
|
300 |
-
["Quem semeia ventos, colhe tempestades.", "JOSE_PAULO_DE_ARAUJO", None, "brpb"],
|
301 |
-
["O olho do dono \u00e9 que engorda o gado.", "VITOR_RAFAEL_OLIVEIRA_ALVES", None, "brba"],
|
302 |
-
["\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.", "MARIA_AURORA_FELIX", None, "brportugal"],
|
303 |
-
["Quem espera sempre alcan\u00e7a.", "ANTONIO_DE_AMORIM_COSTA", None, "brpe"],
|
304 |
-
["Cada macaco no seu galho.", "ALCIDES_DE_LIMA", None, "brmg"],
|
305 |
-
["Em terra de cego, quem tem um olho \u00e9 rei.", "ALUISIO_SOARES_DE_SOUSA", None, "brrj"],
|
306 |
-
["A ocasi\u00e3o faz o ladr\u00e3o.", "FRANCISCO_JOSE_MOREIRA_MOTA", None, "brce"],
|
307 |
-
["De gr\u00e3o em gr\u00e3o, a galinha enche o papo.", "EVALDO_ANDRADA_CORREA", None, "brrs"],
|
308 |
-
["Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.", "DORIS_ALEXANDER", None, "bralemanha"],
|
309 |
-
["Quem n\u00e3o arrisca, n\u00e3o petisca.", "DONALDO_LUIZ_DE_ALMEIDA", None, "brgo"],
|
310 |
-
["A uni\u00e3o faz a for\u00e7a.", "GERONCIO_HENRIQUE_NETO", None, "bral"],
|
311 |
-
["Em boca fechada n\u00e3o entra mosquito.", "MALU_NATEL_FREIRE_WEBER", None, "brpr"],
|
312 |
-
# ["Quem n\u00e3o tem dinheiro, n\u00e3o tem v\u00edcios.", "INES_VIEIRA_BOGEA", None, "bres"],
|
313 |
-
# ["Quando voc\u00ea n\u00e3o corre nenhum risco, voc\u00ea arrisca tudo.", "MARIA_ASSUNCAO_SOUSA", None, "brpi"]
|
314 |
-
],
|
315 |
-
# Enable the weighted sampler
|
316 |
-
use_weighted_sampler=True,
|
317 |
-
# Ensures that all speakers are seen in the training batch equally no matter how many samples each speaker has
|
318 |
-
# weighted_sampler_attrs={"language": 1.0, "speaker_name": 1.0},
|
319 |
-
weighted_sampler_attrs={"language": 1.0},
|
320 |
-
weighted_sampler_multipliers={
|
321 |
-
# "speaker_name": {
|
322 |
-
# you can force the batching scheme to give a higher weight to a certain speaker and then this speaker will appears more frequently on the batch.
|
323 |
-
# It will speedup the speaker adaptation process. Considering the CML train dataset and "new_speaker" as the speaker name of the speaker that you want to adapt.
|
324 |
-
# The line above will make the balancer consider the "new_speaker" as 106 speakers so 1/4 of the number of speakers present on CML dataset.
|
325 |
-
# 'new_speaker': 106, # (CML tot. train speaker)/4 = (424/4) = 106
|
326 |
-
# }
|
327 |
-
},
|
328 |
-
# It defines the Speaker Consistency Loss (SCL) Ξ± to 9 like the YourTTS paper
|
329 |
-
speaker_encoder_loss_alpha=9.0,
|
330 |
-
)
|
331 |
-
|
332 |
-
# Load all the datasets samples and split traning and evaluation sets
|
333 |
-
train_samples, eval_samples = load_tts_samples(
|
334 |
-
config.datasets,
|
335 |
-
eval_split=True,
|
336 |
-
eval_split_max_size=config.eval_split_max_size,
|
337 |
-
eval_split_size=config.eval_split_size,
|
338 |
-
)
|
339 |
-
|
340 |
-
# Init the model
|
341 |
-
model = Vits.init_from_config(config)
|
342 |
-
|
343 |
-
# Init the trainer and π
|
344 |
-
trainer = Trainer(
|
345 |
-
TrainerArgs(restore_path=RESTORE_PATH, skip_train_epoch=SKIP_TRAIN_EPOCH, start_with_eval=True),
|
346 |
-
config,
|
347 |
-
output_path=OUT_PATH,
|
348 |
-
model=model,
|
349 |
-
train_samples=train_samples,
|
350 |
-
eval_samples=eval_samples,
|
351 |
-
)
|
352 |
-
trainer.fit()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/config.json
DELETED
@@ -1,496 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"output_path": "/raid/datasets/MUPE/Experiments/runs",
|
3 |
-
"logger_uri": "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/",
|
4 |
-
"run_name": "YourTTS-Baseline-PT",
|
5 |
-
"project_name": "SYNTACC",
|
6 |
-
"run_description": "\n - YourTTS with SYNTACC text encoder\n ",
|
7 |
-
"print_step": 50,
|
8 |
-
"plot_step": 100,
|
9 |
-
"model_param_stats": false,
|
10 |
-
"wandb_entity": null,
|
11 |
-
"dashboard_logger": "clearml",
|
12 |
-
"save_on_interrupt": true,
|
13 |
-
"log_model_step": 1000,
|
14 |
-
"save_step": 5000,
|
15 |
-
"save_n_checkpoints": 2,
|
16 |
-
"save_checkpoints": true,
|
17 |
-
"save_all_best": false,
|
18 |
-
"save_best_after": 10000,
|
19 |
-
"target_loss": null,
|
20 |
-
"print_eval": false,
|
21 |
-
"test_delay_epochs": 0,
|
22 |
-
"run_eval": true,
|
23 |
-
"run_eval_steps": null,
|
24 |
-
"distributed_backend": "nccl",
|
25 |
-
"distributed_url": "tcp://localhost:54321",
|
26 |
-
"mixed_precision": false,
|
27 |
-
"precision": "fp16",
|
28 |
-
"epochs": 1000,
|
29 |
-
"batch_size": 26,
|
30 |
-
"eval_batch_size": 26,
|
31 |
-
"grad_clip": [
|
32 |
-
1000,
|
33 |
-
1000
|
34 |
-
],
|
35 |
-
"scheduler_after_epoch": true,
|
36 |
-
"lr": 0.001,
|
37 |
-
"optimizer": "AdamW",
|
38 |
-
"optimizer_params": {
|
39 |
-
"betas": [
|
40 |
-
0.8,
|
41 |
-
0.99
|
42 |
-
],
|
43 |
-
"eps": 1e-09,
|
44 |
-
"weight_decay": 0.01
|
45 |
-
},
|
46 |
-
"lr_scheduler": null,
|
47 |
-
"lr_scheduler_params": {},
|
48 |
-
"use_grad_scaler": false,
|
49 |
-
"allow_tf32": false,
|
50 |
-
"cudnn_enable": true,
|
51 |
-
"cudnn_deterministic": false,
|
52 |
-
"cudnn_benchmark": false,
|
53 |
-
"training_seed": 54321,
|
54 |
-
"model": "vits",
|
55 |
-
"num_loader_workers": 8,
|
56 |
-
"num_eval_loader_workers": 0,
|
57 |
-
"use_noise_augment": false,
|
58 |
-
"audio": {
|
59 |
-
"fft_size": 1024,
|
60 |
-
"sample_rate": 16000,
|
61 |
-
"win_length": 1024,
|
62 |
-
"hop_length": 256,
|
63 |
-
"num_mels": 80,
|
64 |
-
"mel_fmin": 0.0,
|
65 |
-
"mel_fmax": null
|
66 |
-
},
|
67 |
-
"use_phonemes": false,
|
68 |
-
"phonemizer": "espeak",
|
69 |
-
"phoneme_language": "en",
|
70 |
-
"compute_input_seq_cache": true,
|
71 |
-
"text_cleaner": "multilingual_cleaners",
|
72 |
-
"enable_eos_bos_chars": false,
|
73 |
-
"test_sentences_file": "",
|
74 |
-
"phoneme_cache_path": null,
|
75 |
-
"characters": {
|
76 |
-
"characters_class": "TTS.tts.models.vits.VitsCharacters",
|
77 |
-
"vocab_dict": null,
|
78 |
-
"pad": "_",
|
79 |
-
"eos": "&",
|
80 |
-
"bos": "*",
|
81 |
-
"blank": null,
|
82 |
-
"characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
|
83 |
-
"punctuations": "\u2014!'(),-.:;?\u00bf ",
|
84 |
-
"phonemes": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
|
85 |
-
"is_unique": true,
|
86 |
-
"is_sorted": true
|
87 |
-
},
|
88 |
-
"add_blank": true,
|
89 |
-
"batch_group_size": 48,
|
90 |
-
"loss_masking": null,
|
91 |
-
"min_audio_len": 1,
|
92 |
-
"max_audio_len": Infinity,
|
93 |
-
"min_text_len": 1,
|
94 |
-
"max_text_len": Infinity,
|
95 |
-
"compute_f0": false,
|
96 |
-
"compute_energy": false,
|
97 |
-
"compute_linear_spec": true,
|
98 |
-
"precompute_num_workers": 12,
|
99 |
-
"start_by_longest": true,
|
100 |
-
"shuffle": false,
|
101 |
-
"drop_last": false,
|
102 |
-
"datasets": [
|
103 |
-
{
|
104 |
-
"formatter": "coqui",
|
105 |
-
"dataset_name": "mupe",
|
106 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
107 |
-
"meta_file_train": "metadata_coqui_brpb.csv",
|
108 |
-
"ignored_speakers": null,
|
109 |
-
"language": "brpb",
|
110 |
-
"phonemizer": "",
|
111 |
-
"meta_file_val": "",
|
112 |
-
"meta_file_attn_mask": ""
|
113 |
-
},
|
114 |
-
{
|
115 |
-
"formatter": "coqui",
|
116 |
-
"dataset_name": "mupe",
|
117 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
118 |
-
"meta_file_train": "metadata_coqui_brba.csv",
|
119 |
-
"ignored_speakers": null,
|
120 |
-
"language": "brba",
|
121 |
-
"phonemizer": "",
|
122 |
-
"meta_file_val": "",
|
123 |
-
"meta_file_attn_mask": ""
|
124 |
-
},
|
125 |
-
{
|
126 |
-
"formatter": "coqui",
|
127 |
-
"dataset_name": "mupe",
|
128 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
129 |
-
"meta_file_train": "metadata_coqui_brportugal.csv",
|
130 |
-
"ignored_speakers": null,
|
131 |
-
"language": "brportugal",
|
132 |
-
"phonemizer": "",
|
133 |
-
"meta_file_val": "",
|
134 |
-
"meta_file_attn_mask": ""
|
135 |
-
},
|
136 |
-
{
|
137 |
-
"formatter": "coqui",
|
138 |
-
"dataset_name": "mupe",
|
139 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
140 |
-
"meta_file_train": "metadata_coqui_brsp.csv",
|
141 |
-
"ignored_speakers": null,
|
142 |
-
"language": "brsp",
|
143 |
-
"phonemizer": "",
|
144 |
-
"meta_file_val": "",
|
145 |
-
"meta_file_attn_mask": ""
|
146 |
-
},
|
147 |
-
{
|
148 |
-
"formatter": "coqui",
|
149 |
-
"dataset_name": "mupe",
|
150 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
151 |
-
"meta_file_train": "metadata_coqui_brpe.csv",
|
152 |
-
"ignored_speakers": null,
|
153 |
-
"language": "brpe",
|
154 |
-
"phonemizer": "",
|
155 |
-
"meta_file_val": "",
|
156 |
-
"meta_file_attn_mask": ""
|
157 |
-
},
|
158 |
-
{
|
159 |
-
"formatter": "coqui",
|
160 |
-
"dataset_name": "mupe",
|
161 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
162 |
-
"meta_file_train": "metadata_coqui_brmg.csv",
|
163 |
-
"ignored_speakers": null,
|
164 |
-
"language": "brmg",
|
165 |
-
"phonemizer": "",
|
166 |
-
"meta_file_val": "",
|
167 |
-
"meta_file_attn_mask": ""
|
168 |
-
},
|
169 |
-
{
|
170 |
-
"formatter": "coqui",
|
171 |
-
"dataset_name": "mupe",
|
172 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
173 |
-
"meta_file_train": "metadata_coqui_brrj.csv",
|
174 |
-
"ignored_speakers": null,
|
175 |
-
"language": "brrj",
|
176 |
-
"phonemizer": "",
|
177 |
-
"meta_file_val": "",
|
178 |
-
"meta_file_attn_mask": ""
|
179 |
-
},
|
180 |
-
{
|
181 |
-
"formatter": "coqui",
|
182 |
-
"dataset_name": "mupe",
|
183 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
184 |
-
"meta_file_train": "metadata_coqui_brce.csv",
|
185 |
-
"ignored_speakers": null,
|
186 |
-
"language": "brce",
|
187 |
-
"phonemizer": "",
|
188 |
-
"meta_file_val": "",
|
189 |
-
"meta_file_attn_mask": ""
|
190 |
-
},
|
191 |
-
{
|
192 |
-
"formatter": "coqui",
|
193 |
-
"dataset_name": "mupe",
|
194 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
195 |
-
"meta_file_train": "metadata_coqui_brrs.csv",
|
196 |
-
"ignored_speakers": null,
|
197 |
-
"language": "brrs",
|
198 |
-
"phonemizer": "",
|
199 |
-
"meta_file_val": "",
|
200 |
-
"meta_file_attn_mask": ""
|
201 |
-
},
|
202 |
-
{
|
203 |
-
"formatter": "coqui",
|
204 |
-
"dataset_name": "mupe",
|
205 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
206 |
-
"meta_file_train": "metadata_coqui_bralemanha.csv",
|
207 |
-
"ignored_speakers": null,
|
208 |
-
"language": "bralemanha",
|
209 |
-
"phonemizer": "",
|
210 |
-
"meta_file_val": "",
|
211 |
-
"meta_file_attn_mask": ""
|
212 |
-
},
|
213 |
-
{
|
214 |
-
"formatter": "coqui",
|
215 |
-
"dataset_name": "mupe",
|
216 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
217 |
-
"meta_file_train": "metadata_coqui_brgo.csv",
|
218 |
-
"ignored_speakers": null,
|
219 |
-
"language": "brgo",
|
220 |
-
"phonemizer": "",
|
221 |
-
"meta_file_val": "",
|
222 |
-
"meta_file_attn_mask": ""
|
223 |
-
},
|
224 |
-
{
|
225 |
-
"formatter": "coqui",
|
226 |
-
"dataset_name": "mupe",
|
227 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
228 |
-
"meta_file_train": "metadata_coqui_bral.csv",
|
229 |
-
"ignored_speakers": null,
|
230 |
-
"language": "bral",
|
231 |
-
"phonemizer": "",
|
232 |
-
"meta_file_val": "",
|
233 |
-
"meta_file_attn_mask": ""
|
234 |
-
},
|
235 |
-
{
|
236 |
-
"formatter": "coqui",
|
237 |
-
"dataset_name": "mupe",
|
238 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
239 |
-
"meta_file_train": "metadata_coqui_brpr.csv",
|
240 |
-
"ignored_speakers": null,
|
241 |
-
"language": "brpr",
|
242 |
-
"phonemizer": "",
|
243 |
-
"meta_file_val": "",
|
244 |
-
"meta_file_attn_mask": ""
|
245 |
-
}
|
246 |
-
],
|
247 |
-
"test_sentences": [
|
248 |
-
[
|
249 |
-
"Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.",
|
250 |
-
"EDILEINE_FONSECA",
|
251 |
-
null,
|
252 |
-
"brsp"
|
253 |
-
],
|
254 |
-
[
|
255 |
-
"Quem semeia ventos, colhe tempestades.",
|
256 |
-
"JOSE_PAULO_DE_ARAUJO",
|
257 |
-
null,
|
258 |
-
"brpb"
|
259 |
-
],
|
260 |
-
[
|
261 |
-
"O olho do dono \u00e9 que engorda o gado.",
|
262 |
-
"VITOR_RAFAEL_OLIVEIRA_ALVES",
|
263 |
-
null,
|
264 |
-
"brba"
|
265 |
-
],
|
266 |
-
[
|
267 |
-
"\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.",
|
268 |
-
"MARIA_AURORA_FELIX",
|
269 |
-
null,
|
270 |
-
"brportugal"
|
271 |
-
],
|
272 |
-
[
|
273 |
-
"Quem espera sempre alcan\u00e7a.",
|
274 |
-
"ANTONIO_DE_AMORIM_COSTA",
|
275 |
-
null,
|
276 |
-
"brpe"
|
277 |
-
],
|
278 |
-
[
|
279 |
-
"Cada macaco no seu galho.",
|
280 |
-
"ALCIDES_DE_LIMA",
|
281 |
-
null,
|
282 |
-
"brmg"
|
283 |
-
],
|
284 |
-
[
|
285 |
-
"Em terra de cego, quem tem um olho \u00e9 rei.",
|
286 |
-
"ALUISIO_SOARES_DE_SOUSA",
|
287 |
-
null,
|
288 |
-
"brrj"
|
289 |
-
],
|
290 |
-
[
|
291 |
-
"A ocasi\u00e3o faz o ladr\u00e3o.",
|
292 |
-
"FRANCISCO_JOSE_MOREIRA_MOTA",
|
293 |
-
null,
|
294 |
-
"brce"
|
295 |
-
],
|
296 |
-
[
|
297 |
-
"De gr\u00e3o em gr\u00e3o, a galinha enche o papo.",
|
298 |
-
"EVALDO_ANDRADA_CORREA",
|
299 |
-
null,
|
300 |
-
"brrs"
|
301 |
-
],
|
302 |
-
[
|
303 |
-
"Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.",
|
304 |
-
"DORIS_ALEXANDER",
|
305 |
-
null,
|
306 |
-
"bralemanha"
|
307 |
-
],
|
308 |
-
[
|
309 |
-
"Quem n\u00e3o arrisca, n\u00e3o petisca.",
|
310 |
-
"DONALDO_LUIZ_DE_ALMEIDA",
|
311 |
-
null,
|
312 |
-
"brgo"
|
313 |
-
],
|
314 |
-
[
|
315 |
-
"A uni\u00e3o faz a for\u00e7a.",
|
316 |
-
"GERONCIO_HENRIQUE_NETO",
|
317 |
-
null,
|
318 |
-
"bral"
|
319 |
-
],
|
320 |
-
[
|
321 |
-
"Em boca fechada n\u00e3o entra mosquito.",
|
322 |
-
"MALU_NATEL_FREIRE_WEBER",
|
323 |
-
null,
|
324 |
-
"brpr"
|
325 |
-
]
|
326 |
-
],
|
327 |
-
"eval_split_max_size": 256,
|
328 |
-
"eval_split_size": 0.01,
|
329 |
-
"use_speaker_weighted_sampler": false,
|
330 |
-
"speaker_weighted_sampler_alpha": 1.0,
|
331 |
-
"use_language_weighted_sampler": false,
|
332 |
-
"language_weighted_sampler_alpha": 1.0,
|
333 |
-
"use_length_weighted_sampler": false,
|
334 |
-
"length_weighted_sampler_alpha": 1.0,
|
335 |
-
"model_args": {
|
336 |
-
"num_chars": 266,
|
337 |
-
"out_channels": 513,
|
338 |
-
"spec_segment_size": 62,
|
339 |
-
"hidden_channels": 192,
|
340 |
-
"use_adaptive_weight_text_encoder": false,
|
341 |
-
"use_perfect_class_batch_sampler": true,
|
342 |
-
"perfect_class_batch_sampler_key": "language",
|
343 |
-
"hidden_channels_ffn_text_encoder": 768,
|
344 |
-
"num_heads_text_encoder": 2,
|
345 |
-
"num_layers_text_encoder": 10,
|
346 |
-
"kernel_size_text_encoder": 3,
|
347 |
-
"dropout_p_text_encoder": 0.1,
|
348 |
-
"dropout_p_duration_predictor": 0.5,
|
349 |
-
"kernel_size_posterior_encoder": 5,
|
350 |
-
"dilation_rate_posterior_encoder": 1,
|
351 |
-
"num_layers_posterior_encoder": 16,
|
352 |
-
"kernel_size_flow": 5,
|
353 |
-
"dilation_rate_flow": 1,
|
354 |
-
"num_layers_flow": 4,
|
355 |
-
"resblock_type_decoder": "2",
|
356 |
-
"resblock_kernel_sizes_decoder": [
|
357 |
-
3,
|
358 |
-
7,
|
359 |
-
11
|
360 |
-
],
|
361 |
-
"resblock_dilation_sizes_decoder": [
|
362 |
-
[
|
363 |
-
1,
|
364 |
-
3,
|
365 |
-
5
|
366 |
-
],
|
367 |
-
[
|
368 |
-
1,
|
369 |
-
3,
|
370 |
-
5
|
371 |
-
],
|
372 |
-
[
|
373 |
-
1,
|
374 |
-
3,
|
375 |
-
5
|
376 |
-
]
|
377 |
-
],
|
378 |
-
"upsample_rates_decoder": [
|
379 |
-
8,
|
380 |
-
8,
|
381 |
-
2,
|
382 |
-
2
|
383 |
-
],
|
384 |
-
"upsample_initial_channel_decoder": 512,
|
385 |
-
"upsample_kernel_sizes_decoder": [
|
386 |
-
16,
|
387 |
-
16,
|
388 |
-
4,
|
389 |
-
4
|
390 |
-
],
|
391 |
-
"periods_multi_period_discriminator": [
|
392 |
-
2,
|
393 |
-
3,
|
394 |
-
5,
|
395 |
-
7,
|
396 |
-
11
|
397 |
-
],
|
398 |
-
"use_sdp": true,
|
399 |
-
"noise_scale": 1.0,
|
400 |
-
"inference_noise_scale": 0.667,
|
401 |
-
"length_scale": 1,
|
402 |
-
"noise_scale_dp": 1.0,
|
403 |
-
"inference_noise_scale_dp": 1.0,
|
404 |
-
"max_inference_len": null,
|
405 |
-
"init_discriminator": true,
|
406 |
-
"use_spectral_norm_disriminator": false,
|
407 |
-
"use_speaker_embedding": false,
|
408 |
-
"num_speakers": 0,
|
409 |
-
"speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/speakers.pth",
|
410 |
-
"d_vector_file": [
|
411 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
|
412 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
|
413 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
|
414 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
|
415 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
|
416 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
|
417 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
|
418 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
|
419 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
|
420 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
|
421 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
|
422 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
|
423 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
|
424 |
-
],
|
425 |
-
"speaker_embedding_channels": 256,
|
426 |
-
"use_d_vector_file": true,
|
427 |
-
"d_vector_dim": 512,
|
428 |
-
"detach_dp_input": true,
|
429 |
-
"use_language_embedding": true,
|
430 |
-
"embedded_language_dim": 4,
|
431 |
-
"num_languages": 0,
|
432 |
-
"language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/language_ids.json",
|
433 |
-
"use_speaker_encoder_as_loss": false,
|
434 |
-
"speaker_encoder_config_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
|
435 |
-
"speaker_encoder_model_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
|
436 |
-
"condition_dp_on_speaker": true,
|
437 |
-
"freeze_encoder": false,
|
438 |
-
"freeze_DP": false,
|
439 |
-
"freeze_PE": false,
|
440 |
-
"freeze_flow_decoder": false,
|
441 |
-
"freeze_waveform_decoder": false,
|
442 |
-
"encoder_sample_rate": null,
|
443 |
-
"interpolate_z": true,
|
444 |
-
"reinit_DP": false,
|
445 |
-
"reinit_text_encoder": false
|
446 |
-
},
|
447 |
-
"lr_gen": 0.0002,
|
448 |
-
"lr_disc": 0.0002,
|
449 |
-
"lr_scheduler_gen": "ExponentialLR",
|
450 |
-
"lr_scheduler_gen_params": {
|
451 |
-
"gamma": 0.999875,
|
452 |
-
"last_epoch": -1
|
453 |
-
},
|
454 |
-
"lr_scheduler_disc": "ExponentialLR",
|
455 |
-
"lr_scheduler_disc_params": {
|
456 |
-
"gamma": 0.999875,
|
457 |
-
"last_epoch": -1
|
458 |
-
},
|
459 |
-
"kl_loss_alpha": 1.0,
|
460 |
-
"disc_loss_alpha": 1.0,
|
461 |
-
"gen_loss_alpha": 1.0,
|
462 |
-
"feat_loss_alpha": 1.0,
|
463 |
-
"mel_loss_alpha": 45.0,
|
464 |
-
"dur_loss_alpha": 1.0,
|
465 |
-
"speaker_encoder_loss_alpha": 9.0,
|
466 |
-
"return_wav": true,
|
467 |
-
"use_weighted_sampler": true,
|
468 |
-
"weighted_sampler_attrs": {
|
469 |
-
"language": 1.0
|
470 |
-
},
|
471 |
-
"weighted_sampler_multipliers": {},
|
472 |
-
"r": 1,
|
473 |
-
"num_speakers": 0,
|
474 |
-
"use_speaker_embedding": false,
|
475 |
-
"speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/speakers.pth",
|
476 |
-
"speaker_embedding_channels": 256,
|
477 |
-
"language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/language_ids.json",
|
478 |
-
"use_language_embedding": true,
|
479 |
-
"use_d_vector_file": true,
|
480 |
-
"d_vector_file": [
|
481 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
|
482 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
|
483 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
|
484 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
|
485 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
|
486 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
|
487 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
|
488 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
|
489 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
|
490 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
|
491 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
|
492 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
|
493 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
|
494 |
-
],
|
495 |
-
"d_vector_dim": 512
|
496 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/language_ids.json
DELETED
@@ -1,15 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"bral": 0,
|
3 |
-
"bralemanha": 1,
|
4 |
-
"brba": 2,
|
5 |
-
"brce": 3,
|
6 |
-
"brgo": 4,
|
7 |
-
"brmg": 5,
|
8 |
-
"brpb": 6,
|
9 |
-
"brpe": 7,
|
10 |
-
"brportugal": 8,
|
11 |
-
"brpr": 9,
|
12 |
-
"brrj": 10,
|
13 |
-
"brrs": 11,
|
14 |
-
"brsp": 12
|
15 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/speakers.pth
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:d0b8d8013199105bfba41bbef0ac6c7fc44ecb3385a39980da80931496c039bf
|
3 |
-
size 3296
|
|
|
|
|
|
|
|
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/train_syntacc_baseline.py
DELETED
@@ -1,352 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
|
3 |
-
import torch
|
4 |
-
from trainer import Trainer, TrainerArgs
|
5 |
-
|
6 |
-
from TTS.bin.compute_embeddings import compute_embeddings
|
7 |
-
from TTS.bin.resample import resample_files
|
8 |
-
from TTS.config.shared_configs import BaseDatasetConfig
|
9 |
-
from TTS.tts.configs.vits_config import VitsConfig
|
10 |
-
from TTS.tts.datasets import load_tts_samples
|
11 |
-
from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig, VitsDataset
|
12 |
-
from TTS.utils.downloaders import download_libri_tts
|
13 |
-
from torch.utils.data import DataLoader
|
14 |
-
from TTS.utils.samplers import PerfectBatchSampler
|
15 |
-
torch.set_num_threads(24)
|
16 |
-
|
17 |
-
# pylint: disable=W0105
|
18 |
-
"""
|
19 |
-
This recipe replicates the first experiment proposed in the CML-TTS paper (https://arxiv.org/abs/2306.10097). It uses the YourTTS model.
|
20 |
-
YourTTS model is based on the VITS model however it uses external speaker embeddings extracted from a pre-trained speaker encoder and has small architecture changes.
|
21 |
-
"""
|
22 |
-
CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
|
23 |
-
|
24 |
-
# Name of the run for the Trainer
|
25 |
-
RUN_NAME = "YourTTS-Baseline-PT"
|
26 |
-
|
27 |
-
# Path where you want to save the models outputs (configs, checkpoints and tensorboard logs)
|
28 |
-
OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs") # "/raid/coqui/Checkpoints/original-YourTTS/"
|
29 |
-
|
30 |
-
# If you want to do transfer learning and speedup your training you can set here the path to the CML-TTS available checkpoint that cam be downloaded here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
|
31 |
-
RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_85000.pth" # Download the checkpoint here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
|
32 |
-
|
33 |
-
# This parameter is useful for debugging: it skips the training epochs and just runs the evaluation and produces the test sentences
SKIP_TRAIN_EPOCH = False

# Set here the batch size to be used in training and evaluation
BATCH_SIZE = 26

# Training sampling rate and the target sampling rate for resampling the downloaded dataset (Note: if you change this you might need to redownload the dataset!)
# Note: if you add new datasets, please make sure that the dataset sampling rate and this parameter match; otherwise resample your audios
SAMPLE_RATE = 16000


# Logger configuration (the ClearML settings below override the TensorBoard defaults above)
DASHBOARD_LOGGER = "tensorboard"
LOGGER_URI = None

DASHBOARD_LOGGER = "clearml"
LOGGER_URI = "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/"


# Max audio length in seconds to be used in training (every audio longer than this will be ignored)
MAX_AUDIO_LEN_IN_SECONDS = float("inf")

# Define here the dataset configs (one per accent/language code)
brpb_train_config = BaseDatasetConfig(
    formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_brpb.csv",
    path="/raid/datasets/MUPE/dataset/mupe/", language="brpb",
)
brba_train_config = BaseDatasetConfig(
    formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_brba.csv",
    path="/raid/datasets/MUPE/dataset/mupe/", language="brba",
)
brportugal_train_config = BaseDatasetConfig(
    formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_brportugal.csv",
    path="/raid/datasets/MUPE/dataset/mupe/", language="brportugal",
)
brsp_train_config = BaseDatasetConfig(
    formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_brsp.csv",
    path="/raid/datasets/MUPE/dataset/mupe/", language="brsp",
)
brpe_train_config = BaseDatasetConfig(
    formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_brpe.csv",
    path="/raid/datasets/MUPE/dataset/mupe/", language="brpe",
)
brmg_train_config = BaseDatasetConfig(
    formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_brmg.csv",
    path="/raid/datasets/MUPE/dataset/mupe/", language="brmg",
)
brrj_train_config = BaseDatasetConfig(
    formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_brrj.csv",
    path="/raid/datasets/MUPE/dataset/mupe/", language="brrj",
)
brce_train_config = BaseDatasetConfig(
    formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_brce.csv",
    path="/raid/datasets/MUPE/dataset/mupe/", language="brce",
)
brrs_train_config = BaseDatasetConfig(
    formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_brrs.csv",
    path="/raid/datasets/MUPE/dataset/mupe/", language="brrs",
)
bralemanha_train_config = BaseDatasetConfig(
    formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_bralemanha.csv",
    path="/raid/datasets/MUPE/dataset/mupe/", language="bralemanha",
)
brgo_train_config = BaseDatasetConfig(
    formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_brgo.csv",
    path="/raid/datasets/MUPE/dataset/mupe/", language="brgo",
)
bral_train_config = BaseDatasetConfig(
    formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_bral.csv",
    path="/raid/datasets/MUPE/dataset/mupe/", language="bral",
)
brpr_train_config = BaseDatasetConfig(
    formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_brpr.csv",
    path="/raid/datasets/MUPE/dataset/mupe/", language="brpr",
)
bres_train_config = BaseDatasetConfig(
    formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_bres.csv",
    path="/raid/datasets/MUPE/dataset/mupe/", language="bres",
)
brpi_train_config = BaseDatasetConfig(
    formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_brpi.csv",
    path="/raid/datasets/MUPE/dataset/mupe/", language="brpi",
)

# bres_train_config and brpi_train_config are excluded: no metadata files found
DATASETS_CONFIG_LIST = [
    brpb_train_config, brba_train_config, brportugal_train_config, brsp_train_config,
    brpe_train_config, brmg_train_config, brrj_train_config, brce_train_config,
    brrs_train_config, bralemanha_train_config, brgo_train_config, bral_train_config,
    brpr_train_config,
]
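# Example (illustrative): once metadata files for the two excluded accents become available,
# their configs defined above can simply be appended to the training list:
# DATASETS_CONFIG_LIST += [bres_train_config, brpi_train_config]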
### Extract speaker embeddings
SPEAKER_ENCODER_CHECKPOINT_PATH = (
    "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
)
SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"

D_VECTOR_FILES = []  # List of speaker embeddings/d-vectors to be used during the training

# Iterate over all the dataset configs, checking whether the speaker embeddings were already computed; if not, compute them
for dataset_conf in DATASETS_CONFIG_LIST:
    # Compute the embeddings only if the file does not exist yet
    embeddings_file = os.path.join(dataset_conf.path, f"H_ASP_speaker_embeddings_{dataset_conf.language}.pth")
    if not os.path.isfile(embeddings_file):
        print(f">>> Computing the speaker embeddings for the {dataset_conf.dataset_name} dataset")
        compute_embeddings(
            SPEAKER_ENCODER_CHECKPOINT_PATH,
            SPEAKER_ENCODER_CONFIG_PATH,
            embeddings_file,
            old_speakers_file=None,
            config_dataset_path=None,
            formatter_name=dataset_conf.formatter,
            dataset_name=dataset_conf.dataset_name,
            dataset_path=dataset_conf.path,
            meta_file_train=dataset_conf.meta_file_train,
            meta_file_val=dataset_conf.meta_file_val,
            disable_cuda=False,
            no_eval=False,
        )
    D_VECTOR_FILES.append(embeddings_file)
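# Example (illustrative sketch): the precomputed d-vector files can be inspected to check how
# many utterance embeddings were extracted per accent before training starts. The exact layout
# of the .pth file (a dict keyed by utterance) is an assumption here.
# import torch
# for f in D_VECTOR_FILES:
#     print(f, len(torch.load(f)))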
# Audio config used in training.
audio_config = VitsAudioConfig(
    sample_rate=SAMPLE_RATE,
    hop_length=256,
    win_length=1024,
    fft_size=1024,
    mel_fmin=0.0,
    mel_fmax=None,
    num_mels=80,
)
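# Note: with sample_rate=16000 and hop_length=256, each spectrogram frame spans 16 ms, so the
# spec_segment_size of 62 frames used below corresponds to roughly one second of audio
# (62 * 256 / 16000 ~= 0.99 s).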
# Init VitsArgs, setting the arguments that are needed for the YourTTS model
model_args = VitsArgs(
    spec_segment_size=62,
    hidden_channels=192,
    hidden_channels_ffn_text_encoder=768,
    num_heads_text_encoder=2,
    num_layers_text_encoder=10,
    kernel_size_text_encoder=3,
    dropout_p_text_encoder=0.1,
    d_vector_file=D_VECTOR_FILES,
    use_d_vector_file=True,
    d_vector_dim=512,
    speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
    speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
    resblock_type_decoder="2",  # In the paper, YourTTS was accidentally trained with ResNet blocks of type 2; if you prefer, you can use type 1 blocks like the VITS model
    # Useful parameter to enable the Speaker Consistency Loss (SCL) described in the paper
    use_speaker_encoder_as_loss=False,
    # Useful parameters to enable multilingual training
    use_language_embedding=True,
    embedded_language_dim=4,
    use_adaptive_weight_text_encoder=False,
    use_perfect_class_batch_sampler=True,
    perfect_class_batch_sampler_key="language",
)
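# Sanity check (illustrative): with use_perfect_class_batch_sampler keyed on "language", each
# batch is presumably split evenly across the accents, so the batch size should be a multiple
# of the number of training accents (here 26 = 2 x 13):
# assert BATCH_SIZE % len(DATASETS_CONFIG_LIST) == 0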
# General training config; here you can change the batch size and other useful parameters
config = VitsConfig(
    output_path=OUT_PATH,
    model_args=model_args,
    run_name=RUN_NAME,
    project_name="SYNTACC",
    run_description="""
            - YourTTS with SYNTACC text encoder
        """,
    dashboard_logger=DASHBOARD_LOGGER,
    logger_uri=LOGGER_URI,
    audio=audio_config,
    batch_size=BATCH_SIZE,
    batch_group_size=48,
    eval_batch_size=BATCH_SIZE,
    num_loader_workers=8,
    eval_split_max_size=256,
    print_step=50,
    plot_step=100,
    log_model_step=1000,
    save_step=5000,
    save_n_checkpoints=2,
    save_checkpoints=True,
    # target_loss="loss_1",
    print_eval=False,
    use_phonemes=False,
    phonemizer="espeak",
    phoneme_language="en",
    compute_input_seq_cache=True,
    add_blank=True,
    text_cleaner="multilingual_cleaners",
    characters=CharactersConfig(
        characters_class="TTS.tts.models.vits.VitsCharacters",
        pad="_",
        eos="&",
        bos="*",
        blank=None,
        characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
        punctuations="\u2014!'(),-.:;?\u00bf ",
        phonemes="iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
        is_unique=True,
        is_sorted=True,
    ),
    phoneme_cache_path=None,
    precompute_num_workers=12,
    start_by_longest=True,
    datasets=DATASETS_CONFIG_LIST,
    cudnn_benchmark=False,
    max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
    mixed_precision=False,
    test_sentences=[
        # GUSTAVO: only speakers seen during training
        ["Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.", "EDILEINE_FONSECA", None, "brsp"],
        ["Quem semeia ventos, colhe tempestades.", "JOSE_PAULO_DE_ARAUJO", None, "brpb"],
        ["O olho do dono \u00e9 que engorda o gado.", "VITOR_RAFAEL_OLIVEIRA_ALVES", None, "brba"],
        ["\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.", "MARIA_AURORA_FELIX", None, "brportugal"],
        ["Quem espera sempre alcan\u00e7a.", "ANTONIO_DE_AMORIM_COSTA", None, "brpe"],
        ["Cada macaco no seu galho.", "ALCIDES_DE_LIMA", None, "brmg"],
        ["Em terra de cego, quem tem um olho \u00e9 rei.", "ALUISIO_SOARES_DE_SOUSA", None, "brrj"],
        ["A ocasi\u00e3o faz o ladr\u00e3o.", "FRANCISCO_JOSE_MOREIRA_MOTA", None, "brce"],
        ["De gr\u00e3o em gr\u00e3o, a galinha enche o papo.", "EVALDO_ANDRADA_CORREA", None, "brrs"],
        ["Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.", "DORIS_ALEXANDER", None, "bralemanha"],
        ["Quem n\u00e3o arrisca, n\u00e3o petisca.", "DONALDO_LUIZ_DE_ALMEIDA", None, "brgo"],
        ["A uni\u00e3o faz a for\u00e7a.", "GERONCIO_HENRIQUE_NETO", None, "bral"],
        ["Em boca fechada n\u00e3o entra mosquito.", "MALU_NATEL_FREIRE_WEBER", None, "brpr"],
        # ["Quem n\u00e3o tem dinheiro, n\u00e3o tem v\u00edcios.", "INES_VIEIRA_BOGEA", None, "bres"],
        # ["Quando voc\u00ea n\u00e3o corre nenhum risco, voc\u00ea arrisca tudo.", "MARIA_ASSUNCAO_SOUSA", None, "brpi"],
    ],
    # Enable the weighted sampler
    use_weighted_sampler=True,
    # Ensures that all languages/accents are seen equally often in the training batches, no matter how many samples each one has
    # weighted_sampler_attrs={"language": 1.0, "speaker_name": 1.0},
    weighted_sampler_attrs={"language": 1.0},
    weighted_sampler_multipliers={
        # "speaker_name": {
        #     # You can force the batching scheme to give a higher weight to a certain speaker, so that this speaker appears more frequently in the batches.
        #     # This speeds up the speaker adaptation process. Consider the CML train set and "new_speaker" as the name of the speaker you want to adapt to:
        #     # the line below makes the balancer treat "new_speaker" as 106 speakers, i.e. 1/4 of the number of speakers present in the CML dataset.
        #     # "new_speaker": 106,  # (CML tot. train speakers)/4 = (424/4) = 106
        # }
    },
    # Set the Speaker Consistency Loss (SCL) alpha to 9, as in the YourTTS paper
    speaker_encoder_loss_alpha=9.0,
)
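# Example (illustrative, following the commented multiplier block above): to oversample a single
# accent, a multiplier entry could be passed instead of the empty dict, e.g.
# weighted_sampler_multipliers={"language": {"brpb": 2.0}}
# which would make "brpb" samples count twice as much in the weighted sampler.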
# Load all the dataset samples and split training and evaluation sets
train_samples, eval_samples = load_tts_samples(
    config.datasets,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

# Init the model
model = Vits.init_from_config(config)

# Init the trainer and 🚀
trainer = Trainer(
    TrainerArgs(restore_path=RESTORE_PATH, skip_train_epoch=SKIP_TRAIN_EPOCH, start_with_eval=True),
    config,
    output_path=OUT_PATH,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)
trainer.fit()
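# Example (illustrative): to continue a run from its latest checkpoint, as done for the
# "YourTTS-Syntacc-PT_continue" experiments in this repository, point RESTORE_PATH at that
# checkpoint before launching, e.g.:
# RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT_continue-January-28-2024_02+26PM-8a499b88c/checkpoint_195000.pth"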
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/best_model.pth
DELETED
@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a082ddde12d21020f66a70cf05a74826488d10008a8379b699458d92509e85d1
size 1043216142
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/best_model_87192.pth
DELETED
@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a082ddde12d21020f66a70cf05a74826488d10008a8379b699458d92509e85d1
size 1043216142
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/checkpoint_130000.pth
DELETED
@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5a584eb832a857f9a11180b34a84b81117d8690ed1e5fa39e4ff711cf6ffd7f7
size 1043220766
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/checkpoint_135000.pth
DELETED
@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:367ac46477805942658a7a78e8cf473409537967f9382a46249a8d11521ed3f9
size 1043220766
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/config.json
DELETED
@@ -1,496 +0,0 @@
{
  "output_path": "/raid/datasets/MUPE/Experiments/runs",
  "logger_uri": "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/",
  "run_name": "YourTTS-Baseline-PT",
  "project_name": "SYNTACC",
  "run_description": "\n - YourTTS with SYNTACC text encoder\n ",
  "print_step": 50,
  "plot_step": 100,
  "model_param_stats": false,
  "wandb_entity": null,
  "dashboard_logger": "clearml",
  "save_on_interrupt": true,
  "log_model_step": 1000,
  "save_step": 5000,
  "save_n_checkpoints": 2,
  "save_checkpoints": true,
  "save_all_best": false,
  "save_best_after": 10000,
  "target_loss": null,
  "print_eval": false,
  "test_delay_epochs": 0,
  "run_eval": true,
  "run_eval_steps": null,
  "distributed_backend": "nccl",
  "distributed_url": "tcp://localhost:54321",
  "mixed_precision": false,
  "precision": "fp16",
  "epochs": 1000,
  "batch_size": 26,
  "eval_batch_size": 26,
  "grad_clip": [1000, 1000],
  "scheduler_after_epoch": true,
  "lr": 0.001,
  "optimizer": "AdamW",
  "optimizer_params": {"betas": [0.8, 0.99], "eps": 1e-09, "weight_decay": 0.01},
  "lr_scheduler": null,
  "lr_scheduler_params": {},
  "use_grad_scaler": false,
  "allow_tf32": false,
  "cudnn_enable": true,
  "cudnn_deterministic": false,
  "cudnn_benchmark": false,
  "training_seed": 54321,
  "model": "vits",
  "num_loader_workers": 8,
  "num_eval_loader_workers": 0,
  "use_noise_augment": false,
  "audio": {"fft_size": 1024, "sample_rate": 16000, "win_length": 1024, "hop_length": 256, "num_mels": 80, "mel_fmin": 0.0, "mel_fmax": null},
  "use_phonemes": false,
  "phonemizer": "espeak",
  "phoneme_language": "en",
  "compute_input_seq_cache": true,
  "text_cleaner": "multilingual_cleaners",
  "enable_eos_bos_chars": false,
  "test_sentences_file": "",
  "phoneme_cache_path": null,
  "characters": {
    "characters_class": "TTS.tts.models.vits.VitsCharacters",
    "vocab_dict": null,
    "pad": "_",
    "eos": "&",
    "bos": "*",
    "blank": null,
    "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
    "punctuations": "\u2014!'(),-.:;?\u00bf ",
    "phonemes": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
    "is_unique": true,
    "is_sorted": true
  },
  "add_blank": true,
  "batch_group_size": 48,
  "loss_masking": null,
  "min_audio_len": 1,
  "max_audio_len": Infinity,
  "min_text_len": 1,
  "max_text_len": Infinity,
  "compute_f0": false,
  "compute_energy": false,
  "compute_linear_spec": true,
  "precompute_num_workers": 12,
  "start_by_longest": true,
  "shuffle": false,
  "drop_last": false,
  "datasets": [
    {"formatter": "coqui", "dataset_name": "mupe", "path": "/raid/datasets/MUPE/dataset/mupe/", "meta_file_train": "metadata_coqui_brpb.csv", "ignored_speakers": null, "language": "brpb", "phonemizer": "", "meta_file_val": "", "meta_file_attn_mask": ""},
    {"formatter": "coqui", "dataset_name": "mupe", "path": "/raid/datasets/MUPE/dataset/mupe/", "meta_file_train": "metadata_coqui_brba.csv", "ignored_speakers": null, "language": "brba", "phonemizer": "", "meta_file_val": "", "meta_file_attn_mask": ""},
    {"formatter": "coqui", "dataset_name": "mupe", "path": "/raid/datasets/MUPE/dataset/mupe/", "meta_file_train": "metadata_coqui_brportugal.csv", "ignored_speakers": null, "language": "brportugal", "phonemizer": "", "meta_file_val": "", "meta_file_attn_mask": ""},
    {"formatter": "coqui", "dataset_name": "mupe", "path": "/raid/datasets/MUPE/dataset/mupe/", "meta_file_train": "metadata_coqui_brsp.csv", "ignored_speakers": null, "language": "brsp", "phonemizer": "", "meta_file_val": "", "meta_file_attn_mask": ""},
    {"formatter": "coqui", "dataset_name": "mupe", "path": "/raid/datasets/MUPE/dataset/mupe/", "meta_file_train": "metadata_coqui_brpe.csv", "ignored_speakers": null, "language": "brpe", "phonemizer": "", "meta_file_val": "", "meta_file_attn_mask": ""},
    {"formatter": "coqui", "dataset_name": "mupe", "path": "/raid/datasets/MUPE/dataset/mupe/", "meta_file_train": "metadata_coqui_brmg.csv", "ignored_speakers": null, "language": "brmg", "phonemizer": "", "meta_file_val": "", "meta_file_attn_mask": ""},
    {"formatter": "coqui", "dataset_name": "mupe", "path": "/raid/datasets/MUPE/dataset/mupe/", "meta_file_train": "metadata_coqui_brrj.csv", "ignored_speakers": null, "language": "brrj", "phonemizer": "", "meta_file_val": "", "meta_file_attn_mask": ""},
    {"formatter": "coqui", "dataset_name": "mupe", "path": "/raid/datasets/MUPE/dataset/mupe/", "meta_file_train": "metadata_coqui_brce.csv", "ignored_speakers": null, "language": "brce", "phonemizer": "", "meta_file_val": "", "meta_file_attn_mask": ""},
    {"formatter": "coqui", "dataset_name": "mupe", "path": "/raid/datasets/MUPE/dataset/mupe/", "meta_file_train": "metadata_coqui_brrs.csv", "ignored_speakers": null, "language": "brrs", "phonemizer": "", "meta_file_val": "", "meta_file_attn_mask": ""},
    {"formatter": "coqui", "dataset_name": "mupe", "path": "/raid/datasets/MUPE/dataset/mupe/", "meta_file_train": "metadata_coqui_bralemanha.csv", "ignored_speakers": null, "language": "bralemanha", "phonemizer": "", "meta_file_val": "", "meta_file_attn_mask": ""},
    {"formatter": "coqui", "dataset_name": "mupe", "path": "/raid/datasets/MUPE/dataset/mupe/", "meta_file_train": "metadata_coqui_brgo.csv", "ignored_speakers": null, "language": "brgo", "phonemizer": "", "meta_file_val": "", "meta_file_attn_mask": ""},
    {"formatter": "coqui", "dataset_name": "mupe", "path": "/raid/datasets/MUPE/dataset/mupe/", "meta_file_train": "metadata_coqui_bral.csv", "ignored_speakers": null, "language": "bral", "phonemizer": "", "meta_file_val": "", "meta_file_attn_mask": ""},
    {"formatter": "coqui", "dataset_name": "mupe", "path": "/raid/datasets/MUPE/dataset/mupe/", "meta_file_train": "metadata_coqui_brpr.csv", "ignored_speakers": null, "language": "brpr", "phonemizer": "", "meta_file_val": "", "meta_file_attn_mask": ""}
  ],
  "test_sentences": [
    ["Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.", "EDILEINE_FONSECA", null, "brsp"],
    ["Quem semeia ventos, colhe tempestades.", "JOSE_PAULO_DE_ARAUJO", null, "brpb"],
    ["O olho do dono \u00e9 que engorda o gado.", "VITOR_RAFAEL_OLIVEIRA_ALVES", null, "brba"],
    ["\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.", "MARIA_AURORA_FELIX", null, "brportugal"],
    ["Quem espera sempre alcan\u00e7a.", "ANTONIO_DE_AMORIM_COSTA", null, "brpe"],
    ["Cada macaco no seu galho.", "ALCIDES_DE_LIMA", null, "brmg"],
    ["Em terra de cego, quem tem um olho \u00e9 rei.", "ALUISIO_SOARES_DE_SOUSA", null, "brrj"],
    ["A ocasi\u00e3o faz o ladr\u00e3o.", "FRANCISCO_JOSE_MOREIRA_MOTA", null, "brce"],
    ["De gr\u00e3o em gr\u00e3o, a galinha enche o papo.", "EVALDO_ANDRADA_CORREA", null, "brrs"],
    ["Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.", "DORIS_ALEXANDER", null, "bralemanha"],
    ["Quem n\u00e3o arrisca, n\u00e3o petisca.", "DONALDO_LUIZ_DE_ALMEIDA", null, "brgo"],
    ["A uni\u00e3o faz a for\u00e7a.", "GERONCIO_HENRIQUE_NETO", null, "bral"],
    ["Em boca fechada n\u00e3o entra mosquito.", "MALU_NATEL_FREIRE_WEBER", null, "brpr"]
  ],
  "eval_split_max_size": 256,
  "eval_split_size": 0.01,
  "use_speaker_weighted_sampler": false,
  "speaker_weighted_sampler_alpha": 1.0,
  "use_language_weighted_sampler": false,
  "language_weighted_sampler_alpha": 1.0,
  "use_length_weighted_sampler": false,
  "length_weighted_sampler_alpha": 1.0,
  "model_args": {
    "num_chars": 266,
    "out_channels": 513,
    "spec_segment_size": 62,
    "hidden_channels": 192,
    "use_adaptive_weight_text_encoder": false,
    "use_perfect_class_batch_sampler": true,
    "perfect_class_batch_sampler_key": "language",
    "hidden_channels_ffn_text_encoder": 768,
    "num_heads_text_encoder": 2,
    "num_layers_text_encoder": 10,
    "kernel_size_text_encoder": 3,
    "dropout_p_text_encoder": 0.1,
    "dropout_p_duration_predictor": 0.5,
    "kernel_size_posterior_encoder": 5,
    "dilation_rate_posterior_encoder": 1,
    "num_layers_posterior_encoder": 16,
    "kernel_size_flow": 5,
    "dilation_rate_flow": 1,
    "num_layers_flow": 4,
    "resblock_type_decoder": "2",
    "resblock_kernel_sizes_decoder": [3, 7, 11],
    "resblock_dilation_sizes_decoder": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    "upsample_rates_decoder": [8, 8, 2, 2],
    "upsample_initial_channel_decoder": 512,
    "upsample_kernel_sizes_decoder": [16, 16, 4, 4],
    "periods_multi_period_discriminator": [2, 3, 5, 7, 11],
    "use_sdp": true,
    "noise_scale": 1.0,
    "inference_noise_scale": 0.667,
    "length_scale": 1,
    "noise_scale_dp": 1.0,
    "inference_noise_scale_dp": 1.0,
    "max_inference_len": null,
    "init_discriminator": true,
    "use_spectral_norm_disriminator": false,
    "use_speaker_embedding": false,
    "num_speakers": 0,
    "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/speakers.pth",
    "d_vector_file": [
      "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
      "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
      "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
      "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
      "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
      "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
      "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
      "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
      "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
      "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
      "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
      "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
      "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
    ],
    "speaker_embedding_channels": 256,
    "use_d_vector_file": true,
    "d_vector_dim": 512,
    "detach_dp_input": true,
    "use_language_embedding": true,
    "embedded_language_dim": 4,
    "num_languages": 0,
    "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/language_ids.json",
    "use_speaker_encoder_as_loss": false,
    "speaker_encoder_config_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
    "speaker_encoder_model_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
    "condition_dp_on_speaker": true,
    "freeze_encoder": false,
    "freeze_DP": false,
    "freeze_PE": false,
    "freeze_flow_decoder": false,
    "freeze_waveform_decoder": false,
    "encoder_sample_rate": null,
    "interpolate_z": true,
    "reinit_DP": false,
    "reinit_text_encoder": false
  },
  "lr_gen": 0.0002,
  "lr_disc": 0.0002,
  "lr_scheduler_gen": "ExponentialLR",
  "lr_scheduler_gen_params": {"gamma": 0.999875, "last_epoch": -1},
  "lr_scheduler_disc": "ExponentialLR",
  "lr_scheduler_disc_params": {"gamma": 0.999875, "last_epoch": -1},
  "kl_loss_alpha": 1.0,
  "disc_loss_alpha": 1.0,
  "gen_loss_alpha": 1.0,
  "feat_loss_alpha": 1.0,
  "mel_loss_alpha": 45.0,
  "dur_loss_alpha": 1.0,
  "speaker_encoder_loss_alpha": 9.0,
  "return_wav": true,
  "use_weighted_sampler": true,
  "weighted_sampler_attrs": {"language": 1.0},
  "weighted_sampler_multipliers": {},
  "r": 1,
  "num_speakers": 0,
  "use_speaker_embedding": false,
  "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/speakers.pth",
  "speaker_embedding_channels": 256,
  "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/language_ids.json",
  "use_language_embedding": true,
  "use_d_vector_file": true,
  "d_vector_file": [
    "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
    "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
    "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
    "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
    "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
    "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
    "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
    "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
    "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
    "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
    "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
    "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
    "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
  ],
  "d_vector_dim": 512
}
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/language_ids.json
DELETED
@@ -1,15 +0,0 @@
{
  "bral": 0,
  "bralemanha": 1,
  "brba": 2,
  "brce": 3,
  "brgo": 4,
  "brmg": 5,
  "brpb": 6,
  "brpe": 7,
  "brportugal": 8,
  "brpr": 9,
  "brrj": 10,
  "brrs": 11,
  "brsp": 12
}
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/speakers.pth
DELETED
@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d0b8d8013199105bfba41bbef0ac6c7fc44ecb3385a39980da80931496c039bf
size 3296
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/trainer_0_log.txt
DELETED
@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5ddf81cb4061c7e47bd824c3ebb109cc02bc31ab79ee21e4e69d60d32aca454b
size 1794644
Experiments/runs/YourTTS-Syntacc-PT_continue-January-28-2024_02+26PM-8a499b88c/{checkpoint_185000.pth β checkpoint_195000.pth}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:8c552bdeff67502deab77d3f587269e090fac00dc991bcfba8dedfa21594d471
 size 1044066458
Experiments/runs/YourTTS-Syntacc-PT_continue-January-28-2024_02+26PM-8a499b88c/trainer_0_log.txt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:327601981f984533599c289f977acc81f9d7479999f14235302e6ad1a171d710
+size 3401880
Experiments/train_syntacc_baseline.py
CHANGED
@@ -28,7 +28,7 @@ RUN_NAME = "YourTTS-Baseline-PT"
 OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs")  # "/raid/coqui/Checkpoints/original-YourTTS/"

 # If you want to do transfer learning and speed up your training, you can set here the path to the available CML-TTS checkpoint, which can be downloaded here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
-RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-
+RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT_continue-January-28-2024_02+26PM-8a499b88c/checkpoint_195000.pth"  # Download the checkpoint here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p

 # This parameter is useful for debugging: it skips the training epochs and just runs the evaluation and produces the test sentences
 SKIP_TRAIN_EPOCH = False