Upload encoder/audio.py with huggingface_hub
encoder/audio.py  +117  -0
ADDED
@@ -0,0 +1,117 @@
# Note: scipy.ndimage.morphology was deprecated and later removed; import from scipy.ndimage
from scipy.ndimage import binary_dilation
from encoder.params_data import *
from pathlib import Path
from typing import Optional, Union
from warnings import warn
import numpy as np
import librosa
import struct

try:
    import webrtcvad
except ImportError:
    warn("Unable to import 'webrtcvad'. This package enables noise removal and is recommended.")
    webrtcvad = None

int16_max = (2 ** 15) - 1

def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
                   source_sr: Optional[int] = None,
                   normalize: Optional[bool] = True,
                   trim_silence: Optional[bool] = True):
    """
    Applies the preprocessing operations used in training the Speaker Encoder to a waveform
    either on disk or in memory. The waveform will be resampled to match the data hyperparameters.

    :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
    just .wav), or the waveform as a numpy array of floats.
    :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
    preprocessing. After preprocessing, the waveform's sampling rate will match the data
    hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
    this argument will be ignored.
    """
    # Load the wav from disk if needed
    if isinstance(fpath_or_wav, (str, Path)):
        wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
    else:
        wav = fpath_or_wav

    # Resample the wav if needed (keyword arguments are required by librosa >= 0.10)
    if source_sr is not None and source_sr != sampling_rate:
        wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)

    # Apply the preprocessing: normalize volume and shorten long silences
    if normalize:
        wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
    if webrtcvad and trim_silence:
        wav = trim_long_silences(wav)

    return wav
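For orientation, a minimal usage sketch of preprocess_wav follows; the input path is hypothetical and source_sr=44100 is just an example value, not something the file prescribes:

# Illustrative usage sketch, not part of the committed file.
# "speaker1/utterance.flac" is a hypothetical path.
wav = preprocess_wav("speaker1/utterance.flac")   # load, resample, normalize, trim silences
# An in-memory waveform also works if its original sampling rate is given:
# wav = preprocess_wav(raw_wav, source_sr=44100)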

def wav_to_mel_spectrogram(wav):
    """
    Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
    Note: this is not a log-mel spectrogram.
    """
    # librosa >= 0.10 requires keyword arguments here
    frames = librosa.feature.melspectrogram(
        y=wav,
        sr=sampling_rate,
        n_fft=int(sampling_rate * mel_window_length / 1000),
        hop_length=int(sampling_rate * mel_window_step / 1000),
        n_mels=mel_n_channels
    )
    return frames.astype(np.float32).T
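As a quick shape check, here is a sketch assuming the repository's usual defaults from params_data.py (sampling_rate=16000, mel_window_length=25, mel_window_step=10, mel_n_channels=40; those values are assumptions, since this diff does not show that file):

# Illustrative check, not part of the committed file; hyperparameter values assumed above.
import numpy as np
one_second = np.zeros(16000, dtype=np.float32)   # 1 s of silence at the assumed 16 kHz
mel = wav_to_mel_spectrogram(one_second)
print(mel.shape)   # (frames, mel_n_channels) -> (101, 40) under the assumed defaults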

def trim_long_silences(wav):
    """
    Ensures that segments without voice in the waveform remain no longer than a
    threshold determined by the VAD parameters in params_data.py.

    :param wav: the raw waveform as a numpy array of floats
    :return: the same waveform with silences trimmed away (length <= original wav length)
    """
    # Compute the voice detection window size
    samples_per_window = (vad_window_length * sampling_rate) // 1000

    # Trim the end of the audio to have a multiple of the window size
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]

    # Convert the float waveform to 16-bit mono PCM
    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))

    # Perform voice activation detection
    voice_flags = []
    vad = webrtcvad.Vad(mode=3)
    for window_start in range(0, len(wav), samples_per_window):
        window_end = window_start + samples_per_window
        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
                                         sample_rate=sampling_rate))
    voice_flags = np.array(voice_flags)

    # Smooth the voice detection with a moving average
    def moving_average(array, width):
        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
        ret = np.cumsum(array_padded, dtype=float)
        ret[width:] = ret[width:] - ret[:-width]
        return ret[width - 1:] / width

    audio_mask = moving_average(voice_flags, vad_moving_average_width)
    # np.bool was removed in NumPy 1.24; the builtin bool is the correct dtype here
    audio_mask = np.round(audio_mask).astype(bool)

    # Dilate the voiced regions
    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
    audio_mask = np.repeat(audio_mask, samples_per_window)

    return wav[audio_mask]
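To make the smoothing step concrete, here is a tiny worked example of the moving_average helper on a made-up flag sequence with width 3:

# Illustrative only, not part of the committed file; the flag values are made up.
import numpy as np
flags = np.array([0, 0, 1, 1, 1, 0, 0], dtype=float)
width = 3
padded = np.concatenate((np.zeros((width - 1) // 2), flags, np.zeros(width // 2)))
ret = np.cumsum(padded, dtype=float)
ret[width:] = ret[width:] - ret[:-width]
print(ret[width - 1:] / width)   # ≈ [0, 0.33, 0.67, 1, 0.67, 0.33, 0]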

def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
    if increase_only and decrease_only:
        raise ValueError("Both increase only and decrease only are set")
    dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2))
    if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
        return wav
    return wav * (10 ** (dBFS_change / 20))
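A worked check of the gain arithmetic above (values made up for illustration): a constant 0.1 waveform has mean power 0.01, i.e. 10 * log10(0.01) = -20 dBFS, so reaching a target of -30 dBFS requires a -10 dB change, an amplitude factor of 10 ** (-10 / 20) ≈ 0.316:

# Illustrative only, not part of the committed file.
import numpy as np
wav = np.full(16000, 0.1, dtype=np.float32)   # mean power 0.01 -> -20 dBFS
out = normalize_volume(wav, target_dBFS=-30.0)
print(10 * np.log10(np.mean(out ** 2)))       # ≈ -30.0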