Spaces:
Runtime error
Runtime error
Upload synthesizer/synthesize.py with huggingface_hub
Browse files- synthesizer/synthesize.py +97 -0
synthesizer/synthesize.py
ADDED
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from torch.utils.data import DataLoader
|
3 |
+
from synthesizer.hparams import hparams_debug_string
|
4 |
+
from synthesizer.synthesizer_dataset import SynthesizerDataset, collate_synthesizer
|
5 |
+
from synthesizer.models.tacotron import Tacotron
|
6 |
+
from synthesizer.utils.text import text_to_sequence
|
7 |
+
from synthesizer.utils.symbols import symbols
|
8 |
+
import numpy as np
|
9 |
+
from pathlib import Path
|
10 |
+
from tqdm import tqdm
|
11 |
+
import platform
|
12 |
+
|
13 |
+
def run_synthesis(in_dir, out_dir, model_dir, hparams):
    """Generate ground truth-aligned (GTA) mels for vocoder training.

    Loads a Tacotron checkpoint, runs it over the dataset described by
    ``<in_dir>/train.txt`` and writes one ``.npy`` spectrogram per utterance
    into ``<out_dir>/mels_gta``. The metadata of every processed utterance is
    written to ``<out_dir>/synthesized.txt``.

    Args:
        in_dir: Directory holding ``train.txt`` plus the ``mels`` and
            ``embeds`` sub-directories handed to ``SynthesizerDataset``.
        out_dir: Directory that receives ``mels_gta/`` and
            ``synthesized.txt``. It is created if it does not exist.
        model_dir: Directory holding the checkpoint. The weights are loaded
            from ``<model_dir>/<model_dir.stem>.pt``.
        hparams: Hyper-parameter object; the ``tts_*``, ``num_mels``,
            ``speaker_embedding_size`` and ``synthesis_batch_size``
            attributes are read here.

    Raises:
        ValueError: If CUDA is available and ``hparams.synthesis_batch_size``
            is not a multiple of the number of visible GPUs.
    """
    out_dir = Path(out_dir)
    synth_dir = out_dir.joinpath("mels_gta")
    # parents=True so that a not-yet-existing out_dir does not abort the run.
    synth_dir.mkdir(parents=True, exist_ok=True)
    print(hparams_debug_string())

    # Check for GPU
    if torch.cuda.is_available():
        device = torch.device("cuda")
        if hparams.synthesis_batch_size % torch.cuda.device_count() != 0:
            raise ValueError("`hparams.synthesis_batch_size` must be evenly divisible by n_gpus!")
    else:
        device = torch.device("cpu")
    print("Synthesizer using device:", device)

    # Instantiate Tacotron model
    model = Tacotron(embed_dims=hparams.tts_embed_dims,
                     num_chars=len(symbols),
                     encoder_dims=hparams.tts_encoder_dims,
                     decoder_dims=hparams.tts_decoder_dims,
                     n_mels=hparams.num_mels,
                     fft_bins=hparams.num_mels,
                     postnet_dims=hparams.tts_postnet_dims,
                     encoder_K=hparams.tts_encoder_K,
                     lstm_dims=hparams.tts_lstm_dims,
                     postnet_K=hparams.tts_postnet_K,
                     num_highways=hparams.tts_num_highways,
                     dropout=0.,  # Use zero dropout for gta mels
                     stop_threshold=hparams.tts_stop_threshold,
                     speaker_embedding_size=hparams.speaker_embedding_size).to(device)

    # Load the weights
    model_dir = Path(model_dir)
    model_fpath = model_dir.joinpath(model_dir.stem).with_suffix(".pt")
    print("\nLoading weights at %s" % model_fpath)
    model.load(model_fpath)
    print("Tacotron weights loaded from step %d" % model.step)

    # Synthesize using same reduction factor as the model is currently trained
    r = np.int32(model.r)

    # Switch the model to eval mode. This alone does not stop autograd from
    # recording operations; gradients are disabled with torch.no_grad() below.
    model.eval()

    # Initialize the dataset
    in_dir = Path(in_dir)
    metadata_fpath = in_dir.joinpath("train.txt")
    mel_dir = in_dir.joinpath("mels")
    embed_dir = in_dir.joinpath("embeds")

    dataset = SynthesizerDataset(metadata_fpath, mel_dir, embed_dir, hparams)
    data_loader = DataLoader(dataset,
                             collate_fn=lambda batch: collate_synthesizer(batch, r, hparams),
                             batch_size=hparams.synthesis_batch_size,
                             num_workers=2 if platform.system() != "Windows" else 0,
                             shuffle=False,
                             # Pinned host memory only helps host-to-GPU copies.
                             pin_memory=device.type == "cuda")

    # The device set-up does not change between batches, so decide once.
    use_multi_gpu = device.type == "cuda" and torch.cuda.device_count() > 1
    if use_multi_gpu:
        # NOTE(review): the helper is not imported at the top of this file;
        # it is assumed to live in `synthesizer.utils` - confirm the path.
        from synthesizer.utils import data_parallel_workaround

    # Generate GTA mels
    meta_out_fpath = out_dir.joinpath("synthesized.txt")
    with open(meta_out_fpath, "w") as file:
        # Inference only: skip building autograd graphs to save memory.
        with torch.no_grad():
            for texts, mels, embeds, idx in tqdm(data_loader, total=len(data_loader)):
                texts = texts.to(device)
                mels = mels.to(device)
                embeds = embeds.to(device)

                # Parallelize model onto GPUS using workaround due to python bug
                if use_multi_gpu:
                    outputs = data_parallel_workaround(model, texts, mels, embeds)
                else:
                    outputs = model(texts, mels, embeds)
                # Only the second output is needed. Indexing instead of
                # unpacking keeps both branches independent of how many
                # values the forward pass returns.
                mels_out = outputs[1]

                for j, k in enumerate(idx):
                    entry = dataset.metadata[k]

                    # Note: outputs mel-spectrogram files and target ones have same names, just different folders
                    mel_filename = synth_dir.joinpath(entry[1])
                    mel_out = mels_out[j].detach().cpu().numpy().T

                    # Use the length of the ground truth mel to remove padding from the generated mels
                    mel_out = mel_out[:int(entry[4])]

                    # Write the spectrogram to disk
                    np.save(mel_filename, mel_out, allow_pickle=False)

                    # Write metadata into the synthesized file
                    file.write("|".join(entry))
|