snnithya committed
Commit 4fc6f5b
1 Parent(s): 7347979

added torchcrepe

Files changed (2)
  1. app.py +21 -50
  2. requirements.txt +2 -4
app.py CHANGED
@@ -30,8 +30,7 @@ import torchaudio
 from absl import app
 from torch.nn.functional import interpolate
 import logging
-import crepe
-from hmmlearn import hmm
+import torchcrepe
 import soundfile as sf
 import pdb
 from gamadhani.utils.generate_utils import load_pitch_fns, load_audio_fns
@@ -54,54 +53,23 @@ def debug_profile(func):
         return pp.profile(sort_by='cumulative', out_lines=10)(func)
     return func
 
-def predict_voicing(confidence):
-    # https://github.com/marl/crepe/pull/26
-    """
-    Find the Viterbi path for voiced versus unvoiced frames.
-    Parameters
-    ----------
-    confidence : np.ndarray [shape=(N,)]
-        voicing confidence array, i.e. the confidence in the presence of
-        a pitch
-    Returns
-    -------
-    voicing_states : np.ndarray [shape=(N,)]
-        HMM predictions for each frames state, 0 if unvoiced, 1 if
-        voiced
-    """
-    # uniform prior on the voicing confidence
-    starting = np.array([0.5, 0.5])
-
-    # transition probabilities inducing continuous voicing state
-    transition = np.array([[0.99, 0.01], [0.01, 0.99]])
-
-    # mean and variance for unvoiced and voiced states
-    means = np.array([[0.0], [1.0]])
-    variances = np.array([[0.25], [0.25]])
-
-    # fix the model parameters because we are not optimizing the model
-    model = hmm.GaussianHMM(n_components=2)
-    model.startprob_, model.covars_, model.transmat_, model.means_, \
-        model.n_features = starting, variances, transition, means, 1
-
-    # find the Viterbi path
-    voicing_states = model.predict(confidence.reshape(-1, 1), [len(confidence)])
-
-    return np.array(voicing_states)
 
 def extract_pitch(audio, unvoice=True, sr=16000, frame_shift_ms=10, log=True):
-    time, frequency, confidence, _ = crepe.predict(
-        audio, sr=sr,
-        viterbi=True,
-        step_size=frame_shift_ms,
-        verbose=0 if not log else 1)
-    f0 = frequency
-    if unvoice:
-        is_voiced = predict_voicing(confidence)
-        frequency_unvoiced = frequency * is_voiced
-        f0 = frequency_unvoiced
-
-    return time, f0, confidence
+    if not isinstance(audio, torch.Tensor):
+        audio = torch.Tensor(audio).to(device)
+    if len(audio.shape) == 1:
+        audio = audio.unsqueeze(0)
+    hop_length = int(sr * frame_shift_ms / 1000)
+    f0 = torchcrepe.predict(audio,
+                            sr,
+                            hop_length=hop_length,
+                            model='tiny',
+                            device=device,
+                            fmin=80,
+                            fmax=800
+                            )
+
+    return f0.squeeze(0)
 
 def generate_pitch_reinterp(pitch, pitch_model, invert_pitch_fn, num_samples, num_steps, noise_std=0.4, t0=0.5):
     '''Generate pitch values for the melodic reinterpretation task'''
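A note on the extract_pitch rewrite above: the unvoice flag is now effectively unused, since the crepe confidence values and the HMM-based predict_voicing step are gone and torchcrepe assigns a pitch value to every frame. If unvoiced-frame masking is still wanted, torchcrepe can also return a periodicity signal that plays the same role as crepe's confidence. A minimal sketch of that variant (not part of this commit; the 0.21 threshold and median window are illustrative values):

import torch
import torchcrepe

def extract_pitch_voiced(audio, sr=16000, frame_shift_ms=10, device='cpu'):
    # Sketch only: torchcrepe pitch plus a periodicity-based voicing mask.
    if not isinstance(audio, torch.Tensor):
        audio = torch.tensor(audio, dtype=torch.float32, device=device)
    if audio.dim() == 1:
        audio = audio.unsqueeze(0)  # torchcrepe expects (batch, samples)
    hop_length = int(sr * frame_shift_ms / 1000)
    f0, periodicity = torchcrepe.predict(
        audio, sr,
        hop_length=hop_length,
        model='tiny',
        fmin=80, fmax=800,
        device=device,
        return_periodicity=True)
    # Smooth the periodicity and zero out low-periodicity frames, mirroring
    # what the removed predict_voicing did with crepe's confidence output.
    periodicity = torchcrepe.filter.median(periodicity, 3)
    f0 = f0 * (periodicity > 0.21).float()
    return f0.squeeze(0)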
 
@@ -219,12 +187,15 @@ def container_generate(model_selection, task_selection, audio, singer_id, t0):
     # make sure the audio is at least 4 s long
     audio = np.pad(audio, (4*sr - len(audio), 0), mode='constant')
     audio = audio.astype(np.float32)
-    audio /= np.max(np.abs(audio))
+    audio /= np.max(np.abs(audio) + np.finfo(float).eps) # normalize audio
     audio = librosa.resample(audio, orig_sr=sr, target_sr=16000) # convert only last 4 s
     mic_audio = audio.copy()
     audio = audio[-12*16000:] # consider only last 12 s
-    _, f0, _ = extract_pitch(audio)
-    mic_f0 = f0.copy() # save the user input pitch values
+    f0 = extract_pitch(audio)
+    # move f0 to cpu
+    if f0.device != 'cpu': #TODO:
+        f0 = f0.cpu()
+    mic_f0 = f0.clone() # save the user input pitch values
     logging.log(logging.INFO, 'Pitch extracted')
     f0 = pitch_task_fn(**{
         'inputs': {
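Two smaller changes in container_generate are worth a note. The eps added inside the normalization guards against all-zero (silent) input, where np.max(np.abs(audio)) is 0 and the old line would divide by zero; and because extract_pitch now returns a torch tensor that may live on the GPU, f0 is moved to the CPU and duplicated with .clone() rather than numpy's .copy(). A quick check of the guarded normalization (illustrative values only):

import numpy as np

silent = np.zeros(16000, dtype=np.float32)
# Old form: np.max(np.abs(silent)) == 0.0, so the division produces NaN with a warning.
safe = silent / np.max(np.abs(silent) + np.finfo(float).eps)  # stays all zeros
assert not np.any(np.isnan(safe))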
requirements.txt CHANGED
@@ -1,4 +1,2 @@
-crepe==0.0.15
-hmmlearn==0.3.2
-tensorflow==2.17.0
-GaMaDHaNi @ git+https://github.com/snnithya/GaMaDHaNi.git@055df71380e0feced7e409470ffc8603f1cfa926
+torchcrepe==0.0.23
+GaMaDHaNi @ git+https://github.com/snnithya/GaMaDHaNi.git@8f781e6a580bf2db794bcc813913a2a5e9efde99
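With the crepe, hmmlearn, and tensorflow pins dropped, pitch tracking now depends only on torchcrepe (torch itself is assumed to come in through the remaining dependencies). A rough sketch of a smoke test after installing from this requirements.txt:

import torch
import torchcrepe

# Run the tiny model over one second of silence at the app's 10 ms hop.
audio = torch.zeros(1, 16000)
f0 = torchcrepe.predict(audio, 16000, hop_length=160, model='tiny',
                        fmin=80, fmax=800, device='cpu')
print(f0.shape)  # expect roughly (1, 101) frames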