|
|
|
|
|
|
|
|
|
|
|
import os |
|
import numpy as np |
|
import torch |
|
import torchaudio |
|
|
|
|
|
def save_feature(process_dir, feature_dir, item, feature, overrides=True):
    """Store a feature as ``<process_dir>/<feature_dir>/<item>.npy``.

    Args:
        process_dir (str): root directory under which features are stored
        feature_dir (str): sub-directory holding one type of feature (mel, energy, ...)
        item (str): uid, used as the file name without extension
        feature (tensor): feature tensor, written with ``np.save``
        overrides (bool, optional): whether to overwrite a file that already
            exists. Defaults to True.
    """
    target_dir = os.path.join(process_dir, feature_dir)
    os.makedirs(target_dir, exist_ok=True)
    out_path = os.path.join(target_dir, item + ".npy")

    # An already-present file is left untouched unless overriding is requested.
    if os.path.exists(out_path) and not overrides:
        return

    np.save(out_path, feature)
|
|
|
|
|
def save_txt(process_dir, feature_dir, item, feature, overrides=True):
    """Save text to ``<process_dir>/<feature_dir>/<item>.txt``.

    Args:
        process_dir (str): root directory under which features are stored
        feature_dir (str): sub-directory holding one type of feature
        item (str): uid, used as the file name without extension
        feature (str or iterable of str): text passed to ``writelines``;
            no line separators are added
        overrides (bool, optional): whether to overwrite a file that already
            exists. Defaults to True.
    """
    process_dir = os.path.join(process_dir, feature_dir)
    os.makedirs(process_dir, exist_ok=True)
    out_path = os.path.join(process_dir, item + ".txt")

    # Keep an existing file as-is when overriding is disabled.
    if os.path.exists(out_path) and not overrides:
        return

    # The context manager closes the handle even if writing raises.
    with open(out_path, "w") as f:
        f.writelines(feature)
|
|
|
|
|
def save_audio(path, waveform, fs, add_silence=False, turn_up=False, volume_peak=0.9):
    """Save audio to path with processing (turn up volume, add silence)

    The audio is written as 16-bit signed PCM. Input with more than one
    channel along the first dimension is averaged down to a single channel.

    Args:
        path (str): path to save audio
        waveform (numpy array): waveform to save, either ``[sequence_len]``
            or ``[n_channel, sequence_len]``
        fs (int): sampling rate
        add_silence (bool, optional): whether to add ``fs // 20`` samples
            (0.05s) of silence to beginning and end. Defaults to False.
        turn_up (bool, optional): whether to rescale so that the largest
            absolute sample equals ``volume_peak``. Defaults to False.
        volume_peak (float, optional): volume peak. Defaults to 0.9.
    """
    # Convert first so that all arithmetic happens in float32; this avoids
    # integer overflow in abs() for int16 input and allows tensor input.
    waveform = torch.as_tensor(waveform, dtype=torch.float32, device="cpu")

    if turn_up:
        # The peak is measured over all channels before any down-mixing.
        peak = waveform.abs().max() if waveform.numel() > 0 else 0.0
        # A silent (all-zero) or empty waveform has no peak to scale to;
        # dividing by zero would fill the output with NaN.
        if peak > 0:
            waveform = waveform * (volume_peak / peak)

    if len(waveform.size()) == 1:
        waveform = waveform[None, :]
    elif waveform.size(0) != 1:
        # Down-mix multi-channel audio to mono.
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    if add_silence:
        silence_len = fs // 20
        # Match every leading dimension so concatenation along time works
        # for multi-channel input as well.
        silence = torch.zeros(
            (*waveform.shape[:-1], silence_len), dtype=waveform.dtype
        )
        waveform = torch.cat([silence, waveform, silence], dim=-1)

    torchaudio.save(path, waveform, fs, encoding="PCM_S", bits_per_sample=16)
|
|
|
|
|
def save_torch_audio(process_dir, feature_dir, item, wav_torch, fs, overrides=True):
    """Save torch audio to path without processing

    Args:
        process_dir (str): directory to store features
        feature_dir (str): directory to store one type of features (mel, energy, ...)
        item (str): uid, used as the file name without extension
        wav_torch (tensor): waveform, either ``[sequence_len]`` or
            ``[n_channel, sequence_len]``
        fs (int): sampling rate
        overrides (bool, optional): whether to override existing files. Defaults to True.
    """
    # Only a 1-D waveform needs a channel dimension. The previous check
    # compared ``shape`` (a torch.Size) with the int 2, which is always
    # unequal, so 2-D input was wrongly expanded to 3-D.
    if wav_torch.dim() == 1:
        wav_torch = wav_torch.unsqueeze(0)

    process_dir = os.path.join(process_dir, feature_dir)
    os.makedirs(process_dir, exist_ok=True)
    out_path = os.path.join(process_dir, item + ".wav")

    # Honour ``overrides`` the same way save_feature does.
    if os.path.exists(out_path) and not overrides:
        return

    torchaudio.save(out_path, wav_torch, fs)
|
|
|
|
|
async def async_load_audio(path, sample_rate: int = 24000):
    r"""Load an audio file as a mono waveform at the requested sample rate.

    Args:
        path: The source loading path.
        sample_rate: The target sample rate, will automatically resample if necessary.

    Returns:
        waveform: The waveform object. Should be [1 x sequence_len].

    Raises:
        ValueError: If the loaded waveform contains NaN or Inf.
    """
    # NOTE: torchaudio.load is a synchronous call, so it blocks the event
    # loop while it runs; wrapping it in a nested coroutine did not change that.
    waveform, sr = torchaudio.load(path)
    # Average over the channel dimension to obtain mono audio.
    waveform = torch.mean(waveform, dim=0, keepdim=True)

    if sr != sample_rate:
        waveform = torchaudio.functional.resample(waveform, sr, sample_rate)

    # Python's ``or`` between two tensors calls bool() on a multi-element
    # tensor and raises; isfinite covers both NaN and +/-Inf element-wise.
    if not torch.isfinite(waveform).all():
        raise ValueError("NaN or Inf found in waveform.")
    return waveform
|
|
|
|
|
async def async_save_audio(
    path,
    waveform,
    sample_rate: int = 24000,
    add_silence: bool = False,
    volume_peak: float = 0.9,
):
    r"""Normalize a waveform to ``volume_peak`` and save it as 16-bit PCM.

    Args:
        path: The target saving path.
        waveform: The waveform object. Should be [n_channel x sequence_len].
        sample_rate: Sample rate.
        add_silence: If ``true``, concat 0.05s silence to beginning and end.
        volume_peak: Turn up volume for larger number, vice versa.
    """
    waveform = torch.as_tensor(waveform, device="cpu", dtype=torch.float32)

    # Rescale so the largest absolute sample equals |volume_peak|. A silent
    # or empty waveform has no peak to scale to; dividing by zero would
    # turn every sample into NaN.
    peak = waveform.abs().max() if waveform.numel() > 0 else 0.0
    if peak > 0:
        waveform = waveform * (abs(volume_peak) / peak)

    if add_silence:
        silence_len = sample_rate // 20
        # ``Tensor.type()`` returns a string, not a dtype; use ``.dtype``.
        silence = torch.zeros(
            (*waveform.shape[:-1], silence_len), dtype=waveform.dtype
        )
        # torch.cat is available in every torch release, unlike the newer
        # torch.concatenate alias.
        waveform = torch.cat((silence, waveform, silence), dim=-1)

    if waveform.dim() == 1:
        waveform = waveform[None]

    # NOTE: torchaudio.save is a synchronous call and blocks the event loop
    # while it runs.
    torchaudio.save(
        path, waveform, sample_rate, encoding="PCM_S", bits_per_sample=16
    )
|
|
|
|
|
def load_mel_extrema(cfg, dataset_name, split):
    """Load the stored mel minimum and maximum arrays for a dataset split.

    Args:
        cfg: config object; ``cfg.OUTPUT_PATH`` and
            ``cfg.data.process_version`` are read to locate the files
        dataset_name (str): name of the dataset sub-directory
        split (str): split name; only the part after the last ``"_"`` is
            used as the sub-directory name

    Returns:
        tuple: ``(mel_min, mel_max)`` as returned by ``np.load``
    """
    dataset_root = os.path.join(
        cfg.OUTPUT_PATH,
        "preprocess/{}_version".format(cfg.data.process_version),
        dataset_name,
    )
    split_tag = split.split("_")[-1]
    extrema_dir = os.path.join(dataset_root, "mel_min_max", split_tag)

    # Both files live side by side in the same directory.
    mel_min, mel_max = (
        np.load(os.path.join(extrema_dir, file_name))
        for file_name in ("mel_min.npy", "mel_max.npy")
    )
    return mel_min, mel_max
|
|