added torchcrepe

- app.py +21 -50
- requirements.txt +2 -4
app.py CHANGED
@@ -30,8 +30,7 @@ import torchaudio
 from absl import app
 from torch.nn.functional import interpolate
 import logging
-import
-from hmmlearn import hmm
+import torchcrepe
 import soundfile as sf
 import pdb
 from gamadhani.utils.generate_utils import load_pitch_fns, load_audio_fns
@@ -54,54 +53,23 @@ def debug_profile(func):
         return pp.profile(sort_by='cumulative', out_lines=10)(func)
     return func
 
-def predict_voicing(confidence):
-    # https://github.com/marl/crepe/pull/26
-    """
-    Find the Viterbi path for voiced versus unvoiced frames.
-    Parameters
-    ----------
-    confidence : np.ndarray [shape=(N,)]
-        voicing confidence array, i.e. the confidence in the presence of
-        a pitch
-    Returns
-    -------
-    voicing_states : np.ndarray [shape=(N,)]
-        HMM predictions for each frames state, 0 if unvoiced, 1 if
-        voiced
-    """
-    # uniform prior on the voicing confidence
-    starting = np.array([0.5, 0.5])
-
-    # transition probabilities inducing continuous voicing state
-    transition = np.array([[0.99, 0.01], [0.01, 0.99]])
-
-    # mean and variance for unvoiced and voiced states
-    means = np.array([[0.0], [1.0]])
-    variances = np.array([[0.25], [0.25]])
-
-    # fix the model parameters because we are not optimizing the model
-    model = hmm.GaussianHMM(n_components=2)
-    model.startprob_, model.covars_, model.transmat_, model.means_, \
-        model.n_features = starting, variances, transition, means, 1
-
-    # find the Viterbi path
-    voicing_states = model.predict(confidence.reshape(-1, 1), [len(confidence)])
-
-    return np.array(voicing_states)
 
 def extract_pitch(audio, unvoice=True, sr=16000, frame_shift_ms=10, log=True):
-
-
-
-
-
-    f0 =
-
-
-
-
+    if not isinstance(audio, torch.Tensor):
+        audio = torch.Tensor(audio).to(device)
+    if len(audio.shape) == 1:
+        audio = audio.unsqueeze(0)
+    hop_length = int(sr * frame_shift_ms / 1000)
+    f0 = torchcrepe.predict(audio,
+                            sr,
+                            hop_length=hop_length,
+                            model='tiny',
+                            device=device,
+                            fmin=80,
+                            fmax=800
+                            )
 
-    return
+    return f0.squeeze(0)
 
 def generate_pitch_reinterp(pitch, pitch_model, invert_pitch_fn, num_samples, num_steps, noise_std=0.4, t0=0.5):
     '''Generate pitch values for the melodic reinterpretation task'''
@@ -219,12 +187,15 @@ def container_generate(model_selection, task_selection, audio, singer_id, t0):
     # make sure the audio is at least 4 s long
     audio = np.pad(audio, (4*sr - len(audio), 0), mode='constant')
     audio = audio.astype(np.float32)
-    audio /= np.max(np.abs(audio))
+    audio /= np.max(np.abs(audio) + np.finfo(float).eps)  # normalize audio
     audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)  # convert only last 4 s
     mic_audio = audio.copy()
     audio = audio[-12*16000:]  # consider only last 12 s
-
-
+    f0 = extract_pitch(audio)
+    # move f0 to cpu
+    if f0.device != 'cpu':  # TODO:
+        f0 = f0.cpu()
+    mic_f0 = f0.clone()  # save the user input pitch values
     logging.log(logging.INFO, 'Pitch extracted')
     f0 = pitch_task_fn(**{
         'inputs': {
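The rewritten extract_pitch hands pitch tracking to torchcrepe.predict and drops the earlier crepe + hmmlearn voicing path. The snippet below is a standalone sketch, not part of this commit: it exercises the same call with the same sample rate, hop length, model size, and frequency range, and shows one way the removed unvoiced-frame handling could be approximated with torchcrepe's periodicity output. The dummy input, the explicit device selection, and the 0.5 threshold are illustrative assumptions.

# Hedged sketch, not the Space's code: run torchcrepe.predict with the same
# parameters as the new extract_pitch, then mask unvoiced frames using the
# periodicity output (the 0.5 threshold is an assumption, not from the commit).
import numpy as np
import torch
import torchcrepe

device = 'cuda' if torch.cuda.is_available() else 'cpu'
sr = 16000
frame_shift_ms = 10
hop_length = int(sr * frame_shift_ms / 1000)   # 160 samples = 10 ms at 16 kHz

# one second of placeholder mono audio standing in for the microphone input
audio = torch.from_numpy(np.random.uniform(-1, 1, sr).astype(np.float32))
audio = audio.unsqueeze(0).to(device)          # torchcrepe expects (batch, samples)

f0, periodicity = torchcrepe.predict(
    audio,
    sr,
    hop_length=hop_length,
    fmin=80,
    fmax=800,
    model='tiny',
    return_periodicity=True,                   # per-frame confidence-like signal
    device=device,
)

f0 = f0.squeeze(0).cpu()                       # (frames,), in Hz
voiced = periodicity.squeeze(0).cpu() > 0.5    # assumed voicing threshold
f0[~voiced] = float('nan')                     # mark unvoiced frames

The removed predict_voicing ran a two-state GaussianHMM over the crepe confidence curve; the simple periodicity threshold above is a much cruder stand-in, so treat it only as a starting point.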
requirements.txt CHANGED
@@ -1,4 +1,2 @@
-
-
-tensorflow==2.17.0
-GaMaDHaNi @ git+https://github.com/snnithya/GaMaDHaNi.git@055df71380e0feced7e409470ffc8603f1cfa926
+torchcrepe==0.0.23
+GaMaDHaNi @ git+https://github.com/snnithya/GaMaDHaNi.git@8f781e6a580bf2db794bcc813913a2a5e9efde99
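requirements.txt now pins torchcrepe==0.0.23 and a newer GaMaDHaNi commit. As an optional sketch (again not part of the commit), a quick runtime check can confirm that the installed torchcrepe matches the pin before the app starts:

# Hedged sketch: verify the installed torchcrepe matches the requirements.txt pin.
from importlib.metadata import version

installed = version("torchcrepe")
assert installed == "0.0.23", f"expected torchcrepe==0.0.23, found {installed}"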