megaaziib committed on
Commit
e6b56b1
1 Parent(s): 0e2a7fc

added rmvpe infer pipelines

Files changed (1)
vc_infer_pipeline.py  +234 -19
vc_infer_pipeline.py CHANGED
@@ -1,11 +1,16 @@
- import numpy as np, parselmouth, torch, pdb
  from time import time as ttime
  import torch.nn.functional as F
  import scipy.signal as signal
  import pyworld, os, traceback, faiss, librosa, torchcrepe
  from scipy import signal
  from functools import lru_cache

  bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)

  input_audio_path2wav = {}
@@ -66,6 +71,186 @@ class VC(object):
          self.t_max = self.sr * self.x_max  # 免查询时长阈值 (query-free duration threshold)
          self.device = config.device

      def get_f0(
          self,
          input_audio_path,
@@ -74,6 +259,7 @@ class VC(object):
          f0_up_key,
          f0_method,
          filter_radius,
          inp_f0=None,
      ):
          global input_audio_path2wav
@@ -103,27 +289,53 @@ class VC(object):
              f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
              if filter_radius > 2:
                  f0 = signal.medfilt(f0, 3)
          elif f0_method == "crepe":
-             model = "full"
-             # Pick a batch size that doesn't cause memory errors on your gpu
-             batch_size = 512
-             # Compute pitch using first gpu
-             audio = torch.tensor(np.copy(x))[None].float()
-             f0, pd = torchcrepe.predict(
-                 audio,
-                 self.sr,
-                 self.window,
                  f0_min,
                  f0_max,
-                 model,
-                 batch_size=batch_size,
-                 device=self.device,
-                 return_periodicity=True,
              )
-             pd = torchcrepe.filter.median(pd, 3)
-             f0 = torchcrepe.filter.mean(f0, 3)
-             f0[pd < 0.1] = 0
-             f0 = f0[0].cpu().numpy()
          f0 *= pow(2, f0_up_key / 12)
          # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
          tf0 = self.sr // self.window  # 每秒f0点数 (f0 points per second)
@@ -147,6 +359,7 @@ class VC(object):
          f0_mel[f0_mel <= 1] = 1
          f0_mel[f0_mel > 255] = 255
          f0_coarse = np.rint(f0_mel).astype(np.int)
          return f0_coarse, f0bak  # 1-0

      def vc(
@@ -271,6 +484,7 @@ class VC(object):
          rms_mix_rate,
          version,
          protect,
          f0_file=None,
      ):
          if (
@@ -332,6 +546,7 @@ class VC(object):
                  f0_up_key,
                  f0_method,
                  filter_radius,
                  inp_f0,
              )
              pitch = pitch[:p_len]
@@ -428,4 +643,4 @@ class VC(object):
          del pitch, pitchf, sid
          if torch.cuda.is_available():
              torch.cuda.empty_cache()
-         return audio_opt

+ import numpy as np, parselmouth, torch, pdb, sys, os
  from time import time as ttime
  import torch.nn.functional as F
+ import torchcrepe  # Fork feature. Use the crepe f0 algorithm. New dependency (pip install torchcrepe)
+ from torch import Tensor
  import scipy.signal as signal
  import pyworld, os, traceback, faiss, librosa, torchcrepe
  from scipy import signal
  from functools import lru_cache

+ now_dir = os.getcwd()
+ sys.path.append(now_dir)
+
  bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)

  input_audio_path2wav = {}

          self.t_max = self.sr * self.x_max  # 免查询时长阈值 (query-free duration threshold)
          self.device = config.device

+     # Fork Feature: Get the best torch device to use for f0 algorithms that require a torch device. Will return the type (torch.device)
+     def get_optimal_torch_device(self, index: int = 0) -> torch.device:
+         # Get cuda device
+         if torch.cuda.is_available():
+             return torch.device(
+                 f"cuda:{index % torch.cuda.device_count()}"
+             )  # Very fast
+         elif torch.backends.mps.is_available():
+             return torch.device("mps")
+         # Insert an else here to grab "xla" devices if available. TO DO later. Requires the torch_xla.core.xla_model library
+         # Else wise return the "cpu" as a torch device,
+         return torch.device("cpu")
+
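For orientation, here is a minimal usage sketch of the device helper added above. It is not part of the commit; the `vc` variable stands in for an already-constructed `VC` instance from this file.

```python
# Sketch only (not from this commit): `vc` is an already-constructed VC instance.
devices = [vc.get_optimal_torch_device(i) for i in range(2)]
# single-GPU machine -> [device(type='cuda', index=0), device(type='cuda', index=0)]
# CPU-only machine   -> [device(type='cpu'), device(type='cpu')]
```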
+     # Fork Feature: Compute f0 with the crepe method
+     def get_f0_crepe_computation(
+         self,
+         x,
+         f0_min,
+         f0_max,
+         p_len,
+         hop_length=160,  # 512 before. Hop length changes the speed that the voice jumps to a different dramatic pitch. Lower hop lengths means more pitch accuracy but longer inference time.
+         model="full",  # Either use crepe-tiny "tiny" or crepe "full". Default is full
+     ):
+         x = x.astype(
+             np.float32
+         )  # fixes the F.conv2D exception. We needed to convert double to float.
+         x /= np.quantile(np.abs(x), 0.999)
+         torch_device = self.get_optimal_torch_device()
+         audio = torch.from_numpy(x).to(torch_device, copy=True)
+         audio = torch.unsqueeze(audio, dim=0)
+         if audio.ndim == 2 and audio.shape[0] > 1:
+             audio = torch.mean(audio, dim=0, keepdim=True).detach()
+         audio = audio.detach()
+         print("Initiating prediction with a crepe_hop_length of: " + str(hop_length))
+         pitch: Tensor = torchcrepe.predict(
+             audio,
+             self.sr,
+             hop_length,
+             f0_min,
+             f0_max,
+             model,
+             batch_size=hop_length * 2,
+             device=torch_device,
+             pad=True,
+         )
+         p_len = p_len or x.shape[0] // hop_length
+         # Resize the pitch for final f0
+         source = np.array(pitch.squeeze(0).cpu().float().numpy())
+         source[source < 0.001] = np.nan
+         target = np.interp(
+             np.arange(0, len(source) * p_len, len(source)) / p_len,
+             np.arange(0, len(source)),
+             source,
+         )
+         f0 = np.nan_to_num(target)
+         return f0  # Resized f0
+
+     def get_f0_official_crepe_computation(
+         self,
+         x,
+         f0_min,
+         f0_max,
+         model="full",
+     ):
+         # Pick a batch size that doesn't cause memory errors on your gpu
+         batch_size = 512
+         # Compute pitch using first gpu
+         audio = torch.tensor(np.copy(x))[None].float()
+         f0, pd = torchcrepe.predict(
+             audio,
+             self.sr,
+             self.window,
+             f0_min,
+             f0_max,
+             model,
+             batch_size=batch_size,
+             device=self.device,
+             return_periodicity=True,
+         )
+         pd = torchcrepe.filter.median(pd, 3)
+         f0 = torchcrepe.filter.mean(f0, 3)
+         f0[pd < 0.1] = 0
+         f0 = f0[0].cpu().numpy()
+         return f0
+
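The two helpers above back the "crepe"/"crepe-tiny" and "mangio-crepe"/"mangio-crepe-tiny" selections used later in get_f0. A hedged usage sketch follows; the `vc` instance, the audio array, and the 50/1100 Hz f0 bounds are illustrative assumptions (they are not shown in this diff), and `p_len` mirrors the frame count implied by `self.window`.

```python
# Sketch only: `vc` is a constructed VC instance, `audio` a mono float32 numpy
# array sampled at vc.sr (16 kHz in this pipeline); 50/1100 Hz are illustrative bounds.
f0_min, f0_max = 50.0, 1100.0
p_len = audio.shape[0] // vc.window  # assumed frame count, matching the pipeline's hop

# "crepe": hop fixed to vc.window, with median/mean smoothing and periodicity gating.
f0_official = vc.get_f0_official_crepe_computation(audio, f0_min, f0_max)

# "mangio-crepe": tunable hop; smaller hops trade inference time for finer pitch tracking.
f0_mangio = vc.get_f0_crepe_computation(audio, f0_min, f0_max, p_len, hop_length=128)
```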
+     # Fork Feature: Compute pYIN f0 method
+     def get_f0_pyin_computation(self, x, f0_min, f0_max):
+         y, sr = librosa.load("saudio/Sidney.wav", self.sr, mono=True)
+         f0, _, _ = librosa.pyin(y, sr=self.sr, fmin=f0_min, fmax=f0_max)
+         f0 = f0[1:]  # Get rid of extra first frame
+         return f0
+
+     # Fork Feature: Acquire median hybrid f0 estimation calculation
+     def get_f0_hybrid_computation(
+         self,
+         methods_str,
+         input_audio_path,
+         x,
+         f0_min,
+         f0_max,
+         p_len,
+         filter_radius,
+         crepe_hop_length,
+         time_step,
+     ):
+         # Get various f0 methods from input to use in the computation stack
+         s = methods_str
+         s = s.split("hybrid")[1]
+         s = s.replace("[", "").replace("]", "")
+         methods = s.split("+")
+         f0_computation_stack = []
+
+         print("Calculating f0 pitch estimations for methods: %s" % str(methods))
+         x = x.astype(np.float32)
+         x /= np.quantile(np.abs(x), 0.999)
+         # Get f0 calculations for all methods specified
+         for method in methods:
+             f0 = None
+             if method == "pm":
+                 f0 = (
+                     parselmouth.Sound(x, self.sr)
+                     .to_pitch_ac(
+                         time_step=time_step / 1000,
+                         voicing_threshold=0.6,
+                         pitch_floor=f0_min,
+                         pitch_ceiling=f0_max,
+                     )
+                     .selected_array["frequency"]
+                 )
+                 pad_size = (p_len - len(f0) + 1) // 2
+                 if pad_size > 0 or p_len - len(f0) - pad_size > 0:
+                     f0 = np.pad(
+                         f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
+                     )
+             elif method == "crepe":
+                 f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max)
+                 f0 = f0[1:]  # Get rid of extra first frame
+             elif method == "crepe-tiny":
+                 f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "tiny")
+                 f0 = f0[1:]  # Get rid of extra first frame
+             elif method == "mangio-crepe":
+                 f0 = self.get_f0_crepe_computation(
+                     x, f0_min, f0_max, p_len, crepe_hop_length
+                 )
+             elif method == "mangio-crepe-tiny":
+                 f0 = self.get_f0_crepe_computation(
+                     x, f0_min, f0_max, p_len, crepe_hop_length, "tiny"
+                 )
+             elif method == "harvest":
+                 f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
+                 if filter_radius > 2:
+                     f0 = signal.medfilt(f0, 3)
+                 f0 = f0[1:]  # Get rid of first frame.
+             elif method == "dio":  # Potentially buggy?
+                 f0, t = pyworld.dio(
+                     x.astype(np.double),
+                     fs=self.sr,
+                     f0_ceil=f0_max,
+                     f0_floor=f0_min,
+                     frame_period=10,
+                 )
+                 f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
+                 f0 = signal.medfilt(f0, 3)
+                 f0 = f0[1:]
+             # elif method == "pyin": Not Working just yet
+             #     f0 = self.get_f0_pyin_computation(x, f0_min, f0_max)
+             # Push method to the stack
+             f0_computation_stack.append(f0)
+
+         for fc in f0_computation_stack:
+             print(len(fc))
+
+         print("Calculating hybrid median f0 from the stack of: %s" % str(methods))
+         f0_median_hybrid = None
+         if len(f0_computation_stack) == 1:
+             f0_median_hybrid = f0_computation_stack[0]
+         else:
+             f0_median_hybrid = np.nanmedian(f0_computation_stack, axis=0)
+         return f0_median_hybrid
+
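The hybrid selector is driven entirely by the method string. The sketch below mirrors the parsing in get_f0_hybrid_computation and shows how the per-method f0 tracks are combined with a NaN-aware median; the selector string follows the fork's naming scheme and the placeholder arrays are made up.

```python
import numpy as np

# Mirror of the string parsing above.
methods_str = "hybrid[harvest+crepe]"
methods = methods_str.split("hybrid")[1].replace("[", "").replace("]", "").split("+")
# -> ["harvest", "crepe"]

# Each selected method yields one f0 track; np.nanmedian combines them frame by
# frame, so a frame one method marks as NaN does not drag the merged estimate down.
stack = [
    np.array([100.0, 110.0, np.nan]),  # placeholder "harvest" track
    np.array([102.0, 108.0, 95.0]),    # placeholder "crepe" track
]
f0_hybrid = np.nanmedian(stack, axis=0)  # -> array([101., 109.,  95.])
```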
      def get_f0(
          self,
          input_audio_path,

          f0_up_key,
          f0_method,
          filter_radius,
+         crepe_hop_length,
          inp_f0=None,
      ):
          global input_audio_path2wav

              f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
              if filter_radius > 2:
                  f0 = signal.medfilt(f0, 3)
+         elif f0_method == "dio":  # Potentially Buggy?
+             f0, t = pyworld.dio(
+                 x.astype(np.double),
+                 fs=self.sr,
+                 f0_ceil=f0_max,
+                 f0_floor=f0_min,
+                 frame_period=10,
+             )
+             f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
+             f0 = signal.medfilt(f0, 3)
          elif f0_method == "crepe":
+             f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max)
+         elif f0_method == "crepe-tiny":
+             f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "tiny")
+         elif f0_method == "mangio-crepe":
+             f0 = self.get_f0_crepe_computation(
+                 x, f0_min, f0_max, p_len, crepe_hop_length
+             )
+         elif f0_method == "mangio-crepe-tiny":
+             f0 = self.get_f0_crepe_computation(
+                 x, f0_min, f0_max, p_len, crepe_hop_length, "tiny"
+             )
+         elif f0_method == "rmvpe":
+             if hasattr(self, "model_rmvpe") == False:
+                 from rmvpe import RMVPE
+
+                 print("loading rmvpe model")
+                 self.model_rmvpe = RMVPE(
+                     "rmvpe.pt", is_half=self.is_half, device=self.device
+                 )
+             f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
+
+         elif "hybrid" in f0_method:
+             # Perform hybrid median pitch estimation
+             input_audio_path2wav[input_audio_path] = x.astype(np.double)
+             f0 = self.get_f0_hybrid_computation(
+                 f0_method,
+                 input_audio_path,
+                 x,
                  f0_min,
                  f0_max,
+                 p_len,
+                 filter_radius,
+                 crepe_hop_length,
+                 time_step,
              )
+
          f0 *= pow(2, f0_up_key / 12)
          # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
          tf0 = self.sr // self.window  # 每秒f0点数 (f0 points per second)
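Selecting the new estimator goes through this same get_f0 entry point, with the model lazily loaded from the repo-local rmvpe module (which is what the sys.path.append(now_dir) at the top of the file enables) and the rmvpe.pt checkpoint read from the working directory; unvoiced frames are gated with thred=0.03. A hedged call sketch follows: the two positional arguments between input_audio_path and f0_up_key are not visible in this hunk, so the waveform and frame-count names below are assumptions.

```python
# Sketch only: `vc` is a constructed VC instance; `audio_pad` is the padded 16 kHz
# mono numpy array the pipeline prepares; rmvpe.pt must sit in the current directory.
pitch, pitchf = vc.get_f0(
    "input.wav",   # input_audio_path (only used as the harvest cache key)
    audio_pad,     # assumed: the waveform argument elided from this hunk
    p_len,         # assumed: number of f0 frames
    0,             # f0_up_key, in semitones
    "rmvpe",       # lazily loads RMVPE("rmvpe.pt", ...) on first use
    3,             # filter_radius (only used by the harvest branch)
    160,           # crepe_hop_length (unused by rmvpe, but now a required argument)
)
```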
 
          f0_mel[f0_mel <= 1] = 1
          f0_mel[f0_mel > 255] = 255
          f0_coarse = np.rint(f0_mel).astype(np.int)
+
          return f0_coarse, f0bak  # 1-0

      def vc(

          rms_mix_rate,
          version,
          protect,
+         crepe_hop_length,
          f0_file=None,
      ):
          if (

              f0_up_key,
              f0_method,
              filter_radius,
+             crepe_hop_length,
              inp_f0,
          )
          pitch = pitch[:p_len]

          del pitch, pitchf, sid
          if torch.cuda.is_available():
              torch.cuda.empty_cache()
+         return audio_opt
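The new "rmvpe" branch expects an rmvpe.pt checkpoint in the working directory, and the commit itself does not say where to obtain it. The sketch below shows one way of staging the file; the repo_id is an assumption, not something this commit specifies.

```python
# Sketch only: stage rmvpe.pt in the current working directory so that
# RMVPE("rmvpe.pt", is_half=..., device=...) can find it on first use.
import os, shutil
from huggingface_hub import hf_hub_download

ckpt = hf_hub_download(repo_id="lj1995/VoiceConversionWebUI", filename="rmvpe.pt")  # assumed source
shutil.copy(ckpt, os.path.join(os.getcwd(), "rmvpe.pt"))
```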