Spaces:
Runtime error
Runtime error
Upload synthesizer/hparams.py with huggingface_hub
Browse files- synthesizer/hparams.py +92 -0
synthesizer/hparams.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import ast
|
2 |
+
import pprint
|
3 |
+
|
4 |
+
class HParams(object):
    """Attribute-style container for hyperparameters.

    Every keyword argument passed to the constructor becomes an instance
    attribute. Values can also be read and written with item syntax
    (``hp["name"]``), which forwards to ``getattr`` / ``setattr``.
    """

    def __init__(self, **kwargs):
        # Store the hyperparameters directly as instance attributes.
        self.__dict__.update(kwargs)

    def __setitem__(self, key, value):
        setattr(self, key, value)

    def __getitem__(self, key):
        # Forwards to getattr, so an unknown key raises AttributeError
        # (not KeyError).
        return getattr(self, key)

    def __repr__(self):
        return pprint.pformat(self.__dict__)

    def parse(self, string):
        """Override hyperparameters from a comma-separated ``name=value`` string.

        Each value is evaluated with ``ast.literal_eval``, so it must be a
        Python literal (number, quoted string, ``True``/``False``/``None``,
        ...). Whitespace around names and values is ignored. When a name is
        given more than once, the last value wins. Only the first ``=`` of a
        pair separates the name from the value, so a quoted value may itself
        contain ``=``. Pairs are split on every comma, so values that contain
        commas (e.g. list literals with several items) are not supported.

        All pairs are parsed before any attribute is modified; if parsing
        fails, the object is left unchanged.

        Args:
            string: Overrides such as ``"sample_rate=22050, rescale=False"``.
                An empty string (or ``None``) leaves the object untouched.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: If a pair has no ``=``, or ``ast.literal_eval``
                rejects the value.
            SyntaxError: If a value is not syntactically valid Python.
        """
        if not string:
            return self

        overrides = {}
        for pair in string.split(","):
            name, separator, raw_value = pair.partition("=")
            if not separator:
                raise ValueError(
                    "Expected 'name=value' but got %r" % pair.strip()
                )
            # Later occurrences of a name replace earlier ones.
            overrides[name.strip()] = ast.literal_eval(raw_value.strip())

        # Apply only after every pair parsed successfully.
        self.__dict__.update(overrides)
        return self
|
20 |
+
|
21 |
+
# Default hyperparameters, grouped by the stage that consumes them.
hparams = HParams(
    # ---- Signal processing (used in both synthesizer and vocoder) ----
    sample_rate=16000,
    n_fft=800,
    num_mels=80,
    # Tacotron uses a 12.5 ms frame shift (set to sample_rate * 0.0125).
    hop_size=200,
    # Tacotron uses a 50 ms frame length (set to sample_rate * 0.050).
    win_size=800,
    fmin=55,
    min_level_db=-100,
    ref_level_db=20,
    # Gradient explodes if too big, premature convergence if too small.
    max_abs_value=4.0,
    # Filter coefficient to use if preemphasize is True.
    preemphasis=0.97,
    preemphasize=True,

    # ---- Tacotron text-to-speech (TTS) ----
    # Embedding dimension for the grapheme/phoneme inputs.
    tts_embed_dims=512,
    tts_encoder_dims=256,
    tts_decoder_dims=128,
    tts_postnet_dims=512,
    tts_encoder_K=5,
    tts_lstm_dims=1024,
    tts_postnet_K=5,
    tts_num_highways=4,
    tts_dropout=0.5,
    tts_cleaner_names=["english_cleaners"],
    # Value below which audio generation ends. For example, for a range of
    # [-4, 4], this will terminate the sequence at the first frame that has
    # all values < -3.4.
    tts_stop_threshold=-3.4,

    # ---- Tacotron training ----
    # Progressive training schedule; each entry is (r, lr, step, batch_size):
    #   r  = reduction factor (# of mel frames synthesized for each decoder
    #        iteration)
    #   lr = learning rate
    tts_schedule=[
        (2, 1e-3, 20_000, 12),
        (2, 5e-4, 40_000, 12),
        (2, 2e-4, 80_000, 12),
        (2, 1e-4, 160_000, 12),
        (2, 3e-5, 320_000, 12),
        (2, 1e-5, 640_000, 12),
    ],
    # Clips the gradient norm to prevent explosion - set to None if not needed.
    tts_clip_grad_norm=1.0,
    # Number of steps between model evaluation (sample generation).
    # Set to -1 to generate after completing an epoch, or 0 to disable.
    tts_eval_interval=500,
    # Makes this number of samples.
    tts_eval_num_samples=1,

    # ---- Data preprocessing ----
    max_mel_frames=900,
    rescale=True,
    rescaling_max=0.9,
    # For vocoder preprocessing and inference.
    synthesis_batch_size=16,

    # ---- Mel visualization and Griffin-Lim ----
    signal_normalization=True,
    power=1.5,
    griffin_lim_iters=60,

    # ---- Audio processing options ----
    # Should not exceed (sample_rate // 2).
    fmax=7600,
    # Used when signal_normalization = True.
    allow_clipping_in_normalization=True,
    # If true, discards samples exceeding max_mel_frames.
    clip_mels_length=True,
    # "Fast spectrogram phase recovery using local weighted sums".
    use_lws=False,
    # Sets mel range to [-max_abs_value, max_abs_value] if True,
    # and [0, max_abs_value] if False.
    symmetric_mels=True,
    # Use with sample_rate of 16000 for best results.
    trim_silence=True,

    # ---- SV2TTS ----
    # Dimension for the speaker embedding.
    speaker_embedding_size=256,
    # Duration in seconds of a silence for an utterance to be split.
    silence_min_duration_split=0.4,
    # Duration in seconds below which utterances are discarded.
    utterance_min_duration=1.6,
)
|
90 |
+
|
91 |
+
def hparams_debug_string():
    """Return the module-level ``hparams`` object rendered as a string."""
    rendered = str(hparams)
    return rendered
|