keithhon committed on
Commit
1345ca3
1 Parent(s): 13663d5

Upload synthesizer/synthesize.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. synthesizer/synthesize.py +97 -0
synthesizer/synthesize.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch.utils.data import DataLoader
3
+ from synthesizer.hparams import hparams_debug_string
4
+ from synthesizer.synthesizer_dataset import SynthesizerDataset, collate_synthesizer
5
+ from synthesizer.models.tacotron import Tacotron
6
+ from synthesizer.utils.text import text_to_sequence
7
+ from synthesizer.utils.symbols import symbols
8
+ import numpy as np
9
+ from pathlib import Path
10
+ from tqdm import tqdm
11
+ import platform
12
+
13
def run_synthesis(in_dir, out_dir, model_dir, hparams):
    """Generate ground truth-aligned (GTA) mel spectrograms for vocoder training.

    Loads a Tacotron checkpoint, runs it over the dataset described by
    ``<in_dir>/train.txt`` (with mels in ``<in_dir>/mels`` and speaker embeddings
    in ``<in_dir>/embeds``), and writes one predicted mel per utterance to
    ``<out_dir>/mels_gta`` plus a metadata file ``<out_dir>/synthesized.txt``.

    Args:
        in_dir: Directory holding ``train.txt``, ``mels/`` and ``embeds/``.
        out_dir: Directory receiving ``mels_gta/`` and ``synthesized.txt``.
            It is created if it does not exist yet.
        model_dir: Directory holding the weights file, expected at
            ``<model_dir>/<model_dir.stem>.pt``.
        hparams: Hyperparameter object; this function reads the ``tts_*``,
            ``num_mels``, ``speaker_embedding_size`` and
            ``synthesis_batch_size`` attributes and forwards the object to the
            dataset and collate function.

    Raises:
        ValueError: If CUDA is available and ``hparams.synthesis_batch_size``
            is not a multiple of the number of visible GPUs.
    """
    # This generates ground truth-aligned mels for vocoder training
    synth_dir = Path(out_dir).joinpath("mels_gta")
    # parents=True so that a not-yet-existing out_dir does not abort the run.
    synth_dir.mkdir(parents=True, exist_ok=True)
    print(hparams_debug_string())

    # Check for GPU
    if torch.cuda.is_available():
        device = torch.device("cuda")
        if hparams.synthesis_batch_size % torch.cuda.device_count() != 0:
            raise ValueError("`hparams.synthesis_batch_size` must be evenly divisible by n_gpus!")
    else:
        device = torch.device("cpu")
    print("Synthesizer using device:", device)

    # Loop-invariant: whether the batch has to be split across several GPUs.
    use_multi_gpu = device.type == "cuda" and torch.cuda.device_count() > 1
    if use_multi_gpu:
        # The original code called this helper without importing it, which
        # raised NameError on multi-GPU machines. Imported here (and only on
        # this path) so single-device runs are unaffected.
        # NOTE(review): assumes the helper lives in synthesizer.utils — confirm.
        from synthesizer.utils import data_parallel_workaround

    # Instantiate Tacotron model
    model = Tacotron(embed_dims=hparams.tts_embed_dims,
                     num_chars=len(symbols),
                     encoder_dims=hparams.tts_encoder_dims,
                     decoder_dims=hparams.tts_decoder_dims,
                     n_mels=hparams.num_mels,
                     fft_bins=hparams.num_mels,
                     postnet_dims=hparams.tts_postnet_dims,
                     encoder_K=hparams.tts_encoder_K,
                     lstm_dims=hparams.tts_lstm_dims,
                     postnet_K=hparams.tts_postnet_K,
                     num_highways=hparams.tts_num_highways,
                     dropout=0.,  # Use zero dropout for gta mels
                     stop_threshold=hparams.tts_stop_threshold,
                     speaker_embedding_size=hparams.speaker_embedding_size).to(device)

    # Load the weights
    model_dir = Path(model_dir)
    model_fpath = model_dir.joinpath(model_dir.stem).with_suffix(".pt")
    print("\nLoading weights at %s" % model_fpath)
    model.load(model_fpath)
    print("Tacotron weights loaded from step %d" % model.step)

    # Synthesize using same reduction factor as the model is currently trained
    r = np.int32(model.r)

    # Switch to eval mode. This alone does not disable gradient tracking; that
    # is handled by torch.no_grad() around the inference loop below.
    model.eval()

    # Initialize the dataset
    in_dir = Path(in_dir)
    metadata_fpath = in_dir.joinpath("train.txt")
    mel_dir = in_dir.joinpath("mels")
    embed_dir = in_dir.joinpath("embeds")

    dataset = SynthesizerDataset(metadata_fpath, mel_dir, embed_dir, hparams)
    # NOTE(review): a lambda collate_fn cannot be pickled, so worker processes
    # only work with the "fork" start method; only Windows is guarded here.
    data_loader = DataLoader(dataset,
                             collate_fn=lambda batch: collate_synthesizer(batch, r, hparams),
                             batch_size=hparams.synthesis_batch_size,
                             num_workers=2 if platform.system() != "Windows" else 0,
                             shuffle=False,
                             pin_memory=True)

    # Generate GTA mels
    meta_out_fpath = Path(out_dir).joinpath("synthesized.txt")
    # no_grad: pure inference, so skip building the autograd graph.
    with open(meta_out_fpath, "w") as file, torch.no_grad():
        for texts, mels, embeds, idx in tqdm(data_loader, total=len(data_loader)):
            texts = texts.to(device)
            mels = mels.to(device)
            embeds = embeds.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if use_multi_gpu:
                outputs = data_parallel_workaround(model, texts, mels, embeds)
            else:
                outputs = model(texts, mels, embeds)
            # The second output holds the mels that get saved. Indexing instead
            # of tuple-unpacking keeps both branches consistent (the original
            # unpacked 3 values on one path and 4 on the other).
            mels_out = outputs[1]

            for j, k in enumerate(idx):
                entry = dataset.metadata[k]

                # Note: outputs mel-spectrogram files and target ones have same names, just different folders
                mel_filename = synth_dir.joinpath(entry[1])
                mel_out = mels_out[j].detach().cpu().numpy().T

                # Use the length of the ground truth mel to remove padding from the generated mels
                mel_out = mel_out[:int(entry[4])]

                # Write the spectrogram to disk
                np.save(mel_filename, mel_out, allow_pickle=False)

                # Write metadata into the synthesized file, one entry per line.
                # Presumably the last field already carries the newline from
                # train.txt; add one only if it is missing so entries never run
                # together.
                line = "|".join(entry)
                if not line.endswith("\n"):
                    line += "\n"
                file.write(line)