|
|
|
|
|
|
|
|
|
|
|
import torch |
|
import numpy as np |
|
from numpy import linalg as LA |
|
import librosa |
|
import soundfile as sf |
|
import librosa.filters |
|
|
|
|
|
def load_audio_torch(wave_file, fs):
    """Load an audio file as a mono, magnitude-normalized torch tensor.

    Args:
        wave_file (str): Path to the audio file; passed to ``librosa.load``.
        fs (int or None): Requested sample rate, forwarded to
            ``librosa.load`` as ``sr``. With ``None`` librosa is expected to
            keep the file's native rate (see the librosa docs).

    Returns:
        tuple: ``(audio, sample_rate)``.
            ``audio`` is a float tensor holding the samples divided by the
            detected full-scale magnitude. If the normalized signal contains
            NaN or inf, an empty list is returned in its place.
            ``sample_rate`` is the rate of the returned samples: ``fs`` when
            it was given, otherwise the rate reported by ``librosa.load``.

    Raises:
        AssertionError: If the loaded signal has fewer than three samples.
    """
    audio, sample_rate = librosa.load(wave_file, sr=fs, mono=True)

    assert len(audio) > 2

    # Choose the divisor that maps the samples to roughly [-1, 1].
    if np.issubdtype(audio.dtype, np.integer):
        # Integer samples: full scale is the magnitude of the dtype minimum.
        max_mag = -np.iinfo(audio.dtype).min
    else:
        # Floating-point samples: guess the original scale from the peak.
        peak = max(np.amax(audio), -np.amin(audio))
        if peak > (2**15):
            max_mag = (2**31) + 1
        elif peak > 1.01:
            max_mag = (2**15) + 1
        else:
            max_mag = 1.0

    audio = torch.FloatTensor(audio.astype(np.float32)) / max_mag

    # Corrupt data: signal it with an empty list but still report a usable rate.
    if (torch.isnan(audio) | torch.isinf(audio)).any():
        return [], sample_rate or fs or 48000

    # librosa.load was already asked for `fs`; this only fires if it handed
    # back a different rate anyway.
    if fs is not None and fs != sample_rate:
        audio = torch.from_numpy(
            librosa.core.resample(audio.numpy(), orig_sr=sample_rate, target_sr=fs)
        )
        sample_rate = fs

    # Return the effective rate rather than `fs`, which is None when the
    # caller asked for the native rate.
    return audio, sample_rate
|
|
|
|
|
def _stft(y, cfg):
    """Run ``librosa.stft`` on ``y`` with the framing settings from ``cfg``.

    Args:
        y: Signal passed to ``librosa.stft`` as ``y``.
        cfg: Object providing ``n_fft``, ``hop_size`` and ``win_size``.

    Returns:
        The value returned by ``librosa.stft``.
    """
    framing = {
        "n_fft": cfg.n_fft,
        "hop_length": cfg.hop_size,
        "win_length": cfg.win_size,
    }
    return librosa.stft(y=y, **framing)
|
|
|
|
|
def energy(wav, cfg):
    """Compute the L2 norm of the STFT magnitudes of ``wav``.

    The magnitude spectrogram is transposed and the norm is taken along
    axis 1 of the transposed array (one value per frame, assuming librosa's
    ``(freq, frames)`` layout for a 1-D signal).

    Args:
        wav: Signal handed to ``_stft``.
        cfg: STFT configuration handed to ``_stft``.

    Returns:
        numpy.ndarray: Norms computed by ``numpy.linalg.norm(..., axis=1)``.
    """
    spectrum = _stft(wav, cfg)
    transposed_magnitudes = np.transpose(np.abs(spectrum))
    norms = LA.norm(transposed_magnitudes, axis=1)
    return norms
|
|
|
|
|
def get_energy_from_tacotron(audio, _stft):
    """Compute mel and energy features through a ``mel_spectrogram`` provider.

    Args:
        audio: Samples accepted by ``torch.FloatTensor`` (e.g. a list or a
            numpy array). Values are clipped to ``[-1, 1]`` and a leading
            batch dimension is added before the call.
        _stft: Object exposing ``mel_spectrogram(audio)`` that returns a
            ``(mel, energy)`` pair. Note that this parameter shadows the
            module-level ``_stft`` function inside this body.

    Returns:
        tuple: ``(mel, energy)`` where ``mel`` is returned exactly as produced
        by ``_stft.mel_spectrogram`` and ``energy`` is a float32 numpy array
        with its leading dimension squeezed out.
    """
    batch = torch.clip(torch.FloatTensor(audio).unsqueeze(0), -1, 1)
    # `.detach()` replaces the deprecated `torch.autograd.Variable(...,
    # requires_grad=False)` wrapper: the input takes no part in autograd.
    batch = batch.detach()

    mel, frame_energy = _stft.mel_spectrogram(batch)
    frame_energy = torch.squeeze(frame_energy, 0).numpy().astype(np.float32)
    return mel, frame_energy
|
|