class Map(dict):
    """
    Example:
    m = Map({'first_name': 'Eduardo'}, last_name='Pool', age=24, sports=['Soccer'])
    Credits to epool:
    https://stackoverflow.com/questions/2352181/how-to-use-a-dot-to-access-members-of-dictionary
    """
    def __init__(self, *args, **kwargs):
        super(Map, self).__init__(*args, **kwargs)
        for arg in args:
            if isinstance(arg, dict):
                for k, v in arg.items():
                    self[k] = v
        if kwargs:
            for k, v in kwargs.items():
                self[k] = v

    def __getattr__(self, attr):
        return self.get(attr)

    def __setattr__(self, key, value):
        self.__setitem__(key, value)

    def __setitem__(self, key, value):
        super(Map, self).__setitem__(key, value)
        self.__dict__.update({key: value})

    def __delattr__(self, item):
        self.__delitem__(item)

    def __delitem__(self, key):
        super(Map, self).__delitem__(key)
        del self.__dict__[key]
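# A minimal usage sketch for Map (hypothetical values, illustration only):
#   m = Map({'a': 1}, b=2)
#   assert m.a == m['a'] == 1   # attribute and item access stay in sync
#   m.c = 3                     # __setattr__ routes through __setitem__
#   del m.c                     # __delattr__ removes the key from both views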
# Default hyperparameters:
hparams = Map({
    'name': "wavenet_vocoder",
    # Convenient model builder
    'builder': "wavenet",
    # Input type:
    # 1. raw [-1, 1]
    # 2. mulaw [-1, 1]
    # 3. mulaw-quantize [0, mu]
    # If input_type is raw or mulaw, the network assumes scalar input and a
    # discretized mixture of logistic distributions output; otherwise one-hot
    # input and softmax output are assumed.
    # **NOTE**: if you change one of the two parameters below, you need to
    # re-run preprocessing before training.
    'input_type': "raw",
    'quantize_channels': 65536,  # 65536 or 256
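    # (2**16 = 65536 corresponds to 16-bit raw/mulaw waveforms; 2**8 = 256 is
    # the usual choice for mulaw-quantize)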
    # Audio:
    'sample_rate': 16000,
    # this is only valid when mu-law is enabled
    'silence_threshold': 2,
    'num_mels': 80,
    'fmin': 125,
    'fmax': 7600,
    'fft_size': 1024,
    # shift can be specified by either hop_size or frame_shift_ms
    'hop_size': 256,
    'frame_shift_ms': None,
    'min_level_db': -100,
    'ref_level_db': 20,
    # whether to rescale the waveform or not.
    # Let x be an input waveform; the rescaled waveform y is given by:
    # y = x / np.abs(x).max() * rescaling_max
    'rescaling': True,
    'rescaling_max': 0.999,
    # The mel-spectrogram is normalized to [0, 1] for each utterance, and clipping
    # may happen depending on min_level_db and ref_level_db, causing clipping noise.
    # If False, an assertion is added to ensure no clipping happens.
    'allow_clipping_in_normalization': True,
    # Mixture of logistic distributions:
    'log_scale_min': float(-32.23619130191664),
    # Model:
    # This should equal `quantize_channels` if mu-law quantize is enabled,
    # otherwise num_mixtures * 3 (pi, mean, log_scale)
    'out_channels': 10 * 3,
    'layers': 24,
    'stacks': 4,
    'residual_channels': 512,
    'gate_channels': 512,  # split into 2 groups internally for gated activation
    'skip_out_channels': 256,
    'dropout': 1 - 0.95,
    'kernel_size': 3,
    # If True, apply weight normalization in the same way as DeepVoice3
    'weight_normalization': True,
    # Use legacy code or not. Default is True since we already provide a model
    # based on the legacy code that can generate high-quality audio.
    # Ref: https://github.com/r9y9/wavenet_vocoder/pull/73
    'legacy': True,
    # Local conditioning (set a negative value to disable)
    'cin_channels': 80,
    # If True, use transposed convolutions to upsample conditional features,
    # otherwise repeat features to adjust the time resolution
    'upsample_conditional_features': True,
    # np.prod(upsample_scales) should equal hop_size
    'upsample_scales': [4, 4, 4, 4],
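    # sanity check: np.prod([4, 4, 4, 4]) = 256, which matches hop_size above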
    # Freq axis kernel size for upsampling network
    'freq_axis_kernel_size': 3,
    # Global conditioning (set a negative value to disable)
    # currently limited to speaker embedding
    # this should only be enabled for multi-speaker datasets
    'gin_channels': -1,  # i.e., speaker embedding dim
    'n_speakers': -1,
    # Data loader
    'pin_memory': True,
    'num_workers': 2,
    # train/test
    # test size can be specified as a portion or a number of samples
    'test_size': 0.0441,  # 50 for CMU ARCTIC single speaker
    'test_num_samples': None,
    'random_state': 1234,
    # Loss
    # Training:
    'batch_size': 2,
    'adam_beta1': 0.9,
    'adam_beta2': 0.999,
    'adam_eps': 1e-8,
    'amsgrad': False,
    'initial_learning_rate': 1e-3,
    # see lrschedule.py for available lr_schedule
    'lr_schedule': "noam_learning_rate_decay",
    'lr_schedule_kwargs': {},  # {"anneal_rate": 0.5, "anneal_interval": 50000},
    'nepochs': 2000,
    'weight_decay': 0.0,
    'clip_thresh': -1,
    # max time steps can be specified either in seconds or in steps
    # if both are None, then full audio samples are used in a batch
    'max_time_sec': None,
    'max_time_steps': 8000,
    # Hold moving-averaged parameters and use them for evaluation
    'exponential_moving_average': True,
    # averaged = decay * averaged + (1 - decay) * x
    'ema_decay': 0.9999,
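    # with decay = 0.9999, each update moves the average 0.01% toward x, so the
    # average effectively tracks the last ~1 / (1 - decay) = 10000 steps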
    # Save
    # per-step intervals
    'checkpoint_interval': 10000,
    'train_eval_interval': 10000,
    # per-epoch interval
    'test_eval_epoch_interval': 5,
    'save_optimizer_state': True,
    # Eval:
})
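# Illustrative sketch (not this project's preprocessing code): how a [-1, 1]
# waveform relates to the mulaw / mulaw-quantize input types above, using the
# standard mu-law companding formula. The helper names are hypothetical.
import numpy as np

def _mulaw(x, mu=255):
    # Compand a [-1, 1] waveform into [-1, 1] with mu-law.
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)

def _mulaw_quantize(x, mu=255):
    # Map companded [-1, 1] values onto integer bins in [0, mu].
    return ((_mulaw(x, mu) + 1) / 2 * mu + 0.5).astype(np.int64)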
def hparams_debug_string():
    # hparams is a dict subclass, so iterate its keys directly.
    hp = ['  %s: %s' % (name, hparams[name]) for name in sorted(hparams.keys())]
    return 'Hyperparameters:\n' + '\n'.join(hp)
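# Quick usage check (sketch): dump the resolved hyperparameters.
if __name__ == "__main__":
    print(hparams_debug_string())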