Commit
96eb9b3
1 Parent(s): 690504e

Create pipeline.py

Files changed (1)
  1. pipeline.py +770 -0
pipeline.py ADDED
@@ -0,0 +1,770 @@
1
+ import os
2
+ import gc
3
+ import re
4
+ import sys
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import parselmouth
8
+ import torchcrepe
9
+ import pyworld
10
+ import faiss
11
+ import librosa
12
+ import numpy as np
13
+ from scipy import signal
14
+ from functools import lru_cache
15
+ from torch import Tensor
16
+
17
+ now_dir = os.getcwd()
18
+ sys.path.append(now_dir)
19
+ from rvc.lib.predictors.RMVPE import RMVPE0Predictor
20
+ from rvc.lib.predictors.FCPE import FCPEF0Predictor
21
+
22
+
23
+ # Constants for high-pass filter
24
+ FILTER_ORDER = 5
25
+ CUTOFF_FREQUENCY = 48 # Hz
26
+ SAMPLE_RATE = 16000 # Hz
27
+ bh, ah = signal.butter(
28
+ N=FILTER_ORDER, Wn=CUTOFF_FREQUENCY, btype="high", fs=SAMPLE_RATE
29
+ )
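+ # The coefficients bh, ah are applied later in Pipeline.pipeline() via
+ # signal.filtfilt to strip DC offset and rumble below 48 Hz before F0 estimation.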
30
+
31
+ input_audio_path2wav = {}
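+ # Module-level cache mapping an audio path to its waveform. get_f0_harvest is
+ # wrapped in lru_cache, which can only hash its scalar/string arguments, so the
+ # (unhashable) NumPy waveform is looked up here by path instead.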
32
+
33
+
34
+ class AudioProcessor:
35
+ """
36
+ A class for processing audio signals, specifically for adjusting RMS levels.
37
+ """
38
+
39
+ @staticmethod
+ def change_rms(
40
+ source_audio: np.ndarray,
41
+ source_rate: int,
42
+ target_audio: np.ndarray,
43
+ target_rate: int,
44
+ rate: float,
45
+ ) -> np.ndarray:
46
+ """
47
+ Adjust the RMS level of target_audio to match the RMS of source_audio, with a given blending rate.
48
+
49
+ Args:
50
+ source_audio: The source audio signal as a NumPy array.
51
+ source_rate: The sampling rate of the source audio.
52
+ target_audio: The target audio signal to adjust.
53
+ target_rate: The sampling rate of the target audio.
54
+ rate: The blending rate between the source and target RMS levels.
55
+
56
+ Returns:
57
+ The adjusted target audio signal with RMS level modified to match the source audio.
58
+ """
59
+ # Calculate RMS of both audio data
60
+ rms1 = librosa.feature.rms(
61
+ y=source_audio,
62
+ frame_length=source_rate // 2 * 2,
63
+ hop_length=source_rate // 2,
64
+ )
65
+ rms2 = librosa.feature.rms(
66
+ y=target_audio,
67
+ frame_length=target_rate // 2 * 2,
68
+ hop_length=target_rate // 2,
69
+ )
70
+
71
+ # Interpolate RMS to match target audio length
72
+ rms1 = F.interpolate(
73
+ torch.from_numpy(rms1).float().unsqueeze(0),
74
+ size=target_audio.shape[0],
75
+ mode="linear",
76
+ ).squeeze()
77
+ rms2 = F.interpolate(
78
+ torch.from_numpy(rms2).float().unsqueeze(0),
79
+ size=target_audio.shape[0],
80
+ mode="linear",
81
+ ).squeeze()
82
+ rms2 = torch.maximum(rms2, torch.zeros_like(rms2) + 1e-6)
83
+
84
+ # Adjust target audio RMS based on the source audio RMS
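+ # output = target * rms1^(1 - rate) * rms2^(rate - 1): rate = 1 leaves the
+ # converted audio untouched, rate = 0 rescales it frame by frame to the source envelope.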
85
+ adjusted_audio = (
86
+ target_audio
87
+ * (torch.pow(rms1, 1 - rate) * torch.pow(rms2, rate - 1)).numpy()
88
+ )
89
+ return adjusted_audio
90
+
91
+
92
+ class Autotune:
93
+ """
94
+ A class for applying autotune to a given fundamental frequency (F0) contour.
95
+ """
96
+
97
+ def __init__(self, ref_freqs):
98
+ """
99
+ Initializes the Autotune class with a set of reference frequencies.
100
+
101
+ Args:
102
+ ref_freqs: A list of reference frequencies representing musical notes.
103
+ """
104
+ self.ref_freqs = ref_freqs
105
+ self.note_dict = self.generate_interpolated_frequencies()
106
+
107
+ def generate_interpolated_frequencies(self):
108
+ """
109
+ Generates a dictionary of interpolated frequencies between reference frequencies.
110
+
111
+ Returns:
112
+ A list of interpolated frequencies, including the original reference frequencies.
113
+ """
114
+ note_dict = []
115
+ for i in range(len(self.ref_freqs) - 1):
116
+ freq_low = self.ref_freqs[i]
117
+ freq_high = self.ref_freqs[i + 1]
118
+ interpolated_freqs = np.linspace(
119
+ freq_low, freq_high, num=10, endpoint=False
120
+ )
121
+ note_dict.extend(interpolated_freqs)
122
+ note_dict.append(self.ref_freqs[-1])
123
+ return note_dict
124
+
125
+ def autotune_f0(self, f0):
126
+ """
127
+ Autotunes a given F0 contour by snapping each frequency to the closest reference frequency.
128
+
129
+ Args:
130
+ f0: The input F0 contour as a NumPy array.
131
+
132
+ Returns:
133
+ The autotuned F0 contour.
134
+ """
135
+ autotuned_f0 = np.zeros_like(f0)
136
+ for i, freq in enumerate(f0):
137
+ closest_note = min(self.note_dict, key=lambda x: abs(x - freq))
138
+ autotuned_f0[i] = closest_note
139
+ return autotuned_f0
140
+
141
+
142
+ class Pipeline:
143
+ """
144
+ The main pipeline class for performing voice conversion, including preprocessing, F0 estimation,
145
+ voice conversion using a model, and post-processing.
146
+ """
147
+
148
+ def __init__(self, tgt_sr, config):
149
+ """
150
+ Initializes the Pipeline class with target sampling rate and configuration parameters.
151
+
152
+ Args:
153
+ tgt_sr: The target sampling rate for the output audio.
154
+ config: A configuration object containing various parameters for the pipeline.
155
+ """
156
+ self.x_pad = config.x_pad
157
+ self.x_query = config.x_query
158
+ self.x_center = config.x_center
159
+ self.x_max = config.x_max
160
+ self.is_half = config.is_half
161
+ self.sample_rate = 16000
162
+ self.window = 160
163
+ self.t_pad = self.sample_rate * self.x_pad
164
+ self.t_pad_tgt = tgt_sr * self.x_pad
165
+ self.t_pad2 = self.t_pad * 2
166
+ self.t_query = self.sample_rate * self.x_query
167
+ self.t_center = self.sample_rate * self.x_center
168
+ self.t_max = self.sample_rate * self.x_max
169
+ self.time_step = self.window / self.sample_rate * 1000
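+ # 160 samples at 16 kHz -> 10 ms hop, i.e. the F0 frame period in milliseconds.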
170
+ self.f0_min = 50
171
+ self.f0_max = 1100
172
+ self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
173
+ self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
174
+ self.device = config.device
175
+ self.ref_freqs = [
176
+ 65.41,
177
+ 82.41,
178
+ 110.00,
179
+ 146.83,
180
+ 196.00,
181
+ 246.94,
182
+ 329.63,
183
+ 440.00,
184
+ 587.33,
185
+ 783.99,
186
+ 1046.50,
187
+ ]
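+ # Roughly C2, E2, A2, D3, G3, B3, E4, A4, D5, G5 and C6: the pitches
+ # that Autotune snaps the F0 contour to.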
188
+ self.autotune = Autotune(self.ref_freqs)
189
+ self.note_dict = self.autotune.note_dict
190
+
191
+ @staticmethod
192
+ @lru_cache
193
+ def get_f0_harvest(input_audio_path, fs, f0max, f0min, frame_period):
194
+ """
195
+ Estimates the fundamental frequency (F0) of a given audio file using the Harvest algorithm.
196
+
197
+ Args:
198
+ input_audio_path: Path to the input audio file.
199
+ fs: Sampling rate of the audio file.
200
+ f0max: Maximum F0 value to consider.
201
+ f0min: Minimum F0 value to consider.
202
+ frame_period: Frame period in milliseconds for F0 analysis.
203
+
204
+ Returns:
205
+ The estimated F0 contour as a NumPy array.
206
+ """
207
+ audio = input_audio_path2wav[input_audio_path]
208
+ f0, t = pyworld.harvest(
209
+ audio,
210
+ fs=fs,
211
+ f0_ceil=f0max,
212
+ f0_floor=f0min,
213
+ frame_period=frame_period,
214
+ )
215
+ f0 = pyworld.stonemask(audio, f0, t, fs)
216
+ return f0
217
+
218
+ def get_f0_crepe(
219
+ self,
220
+ x,
221
+ f0_min,
222
+ f0_max,
223
+ p_len,
224
+ hop_length,
225
+ model="full",
226
+ ):
227
+ """
228
+ Estimates the fundamental frequency (F0) of a given audio signal using the Crepe model.
229
+
230
+ Args:
231
+ x: The input audio signal as a NumPy array.
232
+ f0_min: Minimum F0 value to consider.
233
+ f0_max: Maximum F0 value to consider.
234
+ p_len: Desired length of the F0 output.
235
+ hop_length: Hop length for the Crepe model.
236
+ model: Crepe model size to use ("full" or "tiny").
237
+
238
+ Returns:
239
+ The estimated F0 contour as a NumPy array.
240
+ """
241
+ x = x.astype(np.float32)
242
+ x /= np.quantile(np.abs(x), 0.999)
243
+ audio = torch.from_numpy(x).to(self.device, copy=True)
244
+ audio = torch.unsqueeze(audio, dim=0)
245
+ if audio.ndim == 2 and audio.shape[0] > 1:
246
+ audio = torch.mean(audio, dim=0, keepdim=True).detach()
247
+ audio = audio.detach()
248
+ pitch: Tensor = torchcrepe.predict(
249
+ audio,
250
+ self.sample_rate,
251
+ hop_length,
252
+ f0_min,
253
+ f0_max,
254
+ model,
255
+ batch_size=hop_length * 2,
256
+ device=self.device,
257
+ pad=True,
258
+ )
259
+ p_len = p_len or x.shape[0] // hop_length
260
+ source = np.array(pitch.squeeze(0).cpu().float().numpy())
261
+ source[source < 0.001] = np.nan
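+ # Frames with negligible pitch are flagged as unvoiced (NaN); after resampling
+ # to p_len below they are zeroed by nan_to_num.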
262
+ target = np.interp(
263
+ np.arange(0, len(source) * p_len, len(source)) / p_len,
264
+ np.arange(0, len(source)),
265
+ source,
266
+ )
267
+ f0 = np.nan_to_num(target)
268
+ return f0
269
+
270
+ def get_f0_hybrid(
271
+ self,
272
+ methods_str,
273
+ x,
274
+ f0_min,
275
+ f0_max,
276
+ p_len,
277
+ hop_length,
278
+ ):
279
+ """
280
+ Estimates the fundamental frequency (F0) using a hybrid approach combining multiple methods.
281
+
282
+ Args:
283
+ methods_str: A string specifying the methods to combine (e.g., "hybrid[crepe+rmvpe]").
284
+ x: The input audio signal as a NumPy array.
285
+ f0_min: Minimum F0 value to consider.
286
+ f0_max: Maximum F0 value to consider.
287
+ p_len: Desired length of the F0 output.
288
+ hop_length: Hop length for F0 estimation methods.
289
+
290
+ Returns:
291
+ The estimated F0 contour as a NumPy array, obtained by combining the specified methods.
292
+ """
293
+ methods_str = re.search(r"hybrid\[(.+)\]", methods_str)
294
+ if methods_str:
295
+ methods = [method.strip() for method in methods_str.group(1).split("+")]
296
+ f0_computation_stack = []
297
+ print(f"Calculating f0 pitch estimations for methods {str(methods)}")
298
+ x = x.astype(np.float32)
299
+ x /= np.quantile(np.abs(x), 0.999)
300
+ for method in methods:
301
+ f0 = None
302
+ if method == "crepe":
303
+ f0 = self.get_f0_crepe(
304
+ x, f0_min, f0_max, p_len, int(hop_length)
305
+ )
306
+ elif method == "rmvpe":
307
+ self.model_rmvpe = RMVPE0Predictor(
308
+ os.path.join("rvc", "models", "predictors", "rmvpe.pt"),
309
+ is_half=self.is_half,
310
+ device=self.device,
311
+ )
312
+ f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
313
+ f0 = f0[1:]
314
+ elif method == "fcpe":
315
+ self.model_fcpe = FCPEF0Predictor(
316
+ os.path.join("rvc", "models", "predictors", "fcpe.pt"),
317
+ f0_min=int(f0_min),
318
+ f0_max=int(f0_max),
319
+ dtype=torch.float32,
320
+ device=self.device,
321
+ sampling_rate=self.sample_rate,
322
+ threshold=0.03,
323
+ )
324
+ f0 = self.model_fcpe.compute_f0(x, p_len=p_len)
325
+ del self.model_fcpe
326
+ gc.collect()
327
+ f0_computation_stack.append(f0)
328
+
329
+ f0_computation_stack = [fc for fc in f0_computation_stack if fc is not None]
330
+ f0_median_hybrid = None
331
+ if len(f0_computation_stack) == 1:
332
+ f0_median_hybrid = f0_computation_stack[0]
333
+ else:
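+ # Combine the methods with a NaN-aware per-frame median to smooth out
+ # outliers from any single estimator.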
334
+ f0_median_hybrid = np.nanmedian(f0_computation_stack, axis=0)
335
+ return f0_median_hybrid
336
+
337
+ def get_f0(
338
+ self,
339
+ input_audio_path,
340
+ x,
341
+ p_len,
342
+ f0_up_key,
343
+ f0_method,
344
+ filter_radius,
345
+ hop_length,
346
+ f0_autotune,
347
+ inp_f0=None,
348
+ ):
349
+ """
350
+ Estimates the fundamental frequency (F0) of a given audio signal using various methods.
351
+
352
+ Args:
353
+ input_audio_path: Path to the input audio file.
354
+ x: The input audio signal as a NumPy array.
355
+ p_len: Desired length of the F0 output.
356
+ f0_up_key: Key to adjust the pitch of the F0 contour.
357
+ f0_method: Method to use for F0 estimation (e.g., "pm", "harvest", "crepe").
358
+ filter_radius: Radius for median filtering the F0 contour.
359
+ hop_length: Hop length for F0 estimation methods.
360
+ f0_autotune: Whether to apply autotune to the F0 contour.
361
+ inp_f0: Optional input F0 contour to use instead of estimating.
362
+
363
+ Returns:
364
+ A tuple containing the quantized F0 contour and the original F0 contour.
365
+ """
366
+ global input_audio_path2wav
367
+ if f0_method == "pm":
368
+ f0 = (
369
+ parselmouth.Sound(x, self.sample_rate)
370
+ .to_pitch_ac(
371
+ time_step=self.time_step / 1000,
372
+ voicing_threshold=0.6,
373
+ pitch_floor=self.f0_min,
374
+ pitch_ceiling=self.f0_max,
375
+ )
376
+ .selected_array["frequency"]
377
+ )
378
+ pad_size = (p_len - len(f0) + 1) // 2
379
+ if pad_size > 0 or p_len - len(f0) - pad_size > 0:
380
+ f0 = np.pad(
381
+ f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
382
+ )
383
+ elif f0_method == "harvest":
384
+ input_audio_path2wav[input_audio_path] = x.astype(np.double)
385
+ f0 = self.get_f0_harvest(
386
+ input_audio_path, self.sample_rate, self.f0_max, self.f0_min, 10
387
+ )
388
+ if int(filter_radius) > 2:
389
+ f0 = signal.medfilt(f0, 3)
390
+ elif f0_method == "dio":
391
+ f0, t = pyworld.dio(
392
+ x.astype(np.double),
393
+ fs=self.sample_rate,
394
+ f0_ceil=self.f0_max,
395
+ f0_floor=self.f0_min,
396
+ frame_period=10,
397
+ )
398
+ f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sample_rate)
399
+ f0 = signal.medfilt(f0, 3)
400
+ elif f0_method == "crepe":
401
+ f0 = self.get_f0_crepe(x, self.f0_min, self.f0_max, p_len, int(hop_length))
402
+ elif f0_method == "crepe-tiny":
403
+ f0 = self.get_f0_crepe(
404
+ x, self.f0_min, self.f0_max, p_len, int(hop_length), "tiny"
405
+ )
406
+ elif f0_method == "rmvpe":
407
+ self.model_rmvpe = RMVPE0Predictor(
408
+ os.path.join("rvc", "models", "predictors", "rmvpe.pt"),
409
+ is_half=self.is_half,
410
+ device=self.device,
411
+ )
412
+ f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
413
+ elif f0_method == "fcpe":
414
+ self.model_fcpe = FCPEF0Predictor(
415
+ os.path.join("rvc", "models", "predictors", "fcpe.pt"),
416
+ f0_min=int(self.f0_min),
417
+ f0_max=int(self.f0_max),
418
+ dtype=torch.float32,
419
+ device=self.device,
420
+ sampling_rate=self.sample_rate,
421
+ threshold=0.03,
422
+ )
423
+ f0 = self.model_fcpe.compute_f0(x, p_len=p_len)
424
+ del self.model_fcpe
425
+ gc.collect()
426
+ elif "hybrid" in f0_method:
427
+ input_audio_path2wav[input_audio_path] = x.astype(np.double)
428
+ f0 = self.get_f0_hybrid(
429
+ f0_method,
430
+ x,
431
+ self.f0_min,
432
+ self.f0_max,
433
+ p_len,
434
+ hop_length,
435
+ )
436
+
437
+ if f0_autotune == "True":
438
+ f0 = self.autotune.autotune_f0(f0)
439
+
440
+ f0 *= pow(2, f0_up_key / 12)
441
+ tf0 = self.sample_rate // self.window
442
+ if inp_f0 is not None:
443
+ delta_t = np.round(
444
+ (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
445
+ ).astype("int16")
446
+ replace_f0 = np.interp(
447
+ list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
448
+ )
449
+ shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
450
+ f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
451
+ :shape
452
+ ]
453
+ f0bak = f0.copy()
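+ # Quantize the contour onto a coarse 1-255 mel scale (f0_coarse) for the model's
+ # pitch lookup, keeping the raw Hz values in f0bak; 440 Hz lands at roughly bin 122.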
454
+ f0_mel = 1127 * np.log(1 + f0 / 700)
455
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (
456
+ self.f0_mel_max - self.f0_mel_min
457
+ ) + 1
458
+ f0_mel[f0_mel <= 1] = 1
459
+ f0_mel[f0_mel > 255] = 255
460
+ f0_coarse = np.rint(f0_mel).astype(np.int64)
461
+
462
+ return f0_coarse, f0bak
463
+
464
+ def voice_conversion(
465
+ self,
466
+ model,
467
+ net_g,
468
+ sid,
469
+ audio0,
470
+ pitch,
471
+ pitchf,
472
+ index,
473
+ big_npy,
474
+ index_rate,
475
+ version,
476
+ protect,
477
+ ):
478
+ """
479
+ Performs voice conversion on a given audio segment.
480
+
481
+ Args:
482
+ model: The feature extractor model.
483
+ net_g: The generative model for synthesizing speech.
484
+ sid: Speaker ID for the target voice.
485
+ audio0: The input audio segment.
486
+ pitch: Quantized F0 contour for pitch guidance.
487
+ pitchf: Original F0 contour for pitch guidance.
488
+ index: FAISS index for speaker embedding retrieval.
489
+ big_npy: Speaker embeddings stored in a NumPy array.
490
+ index_rate: Blending rate for speaker embedding retrieval.
491
+ version: Model version ("v1" or "v2").
492
+ protect: Protection level for preserving the original pitch.
493
+
494
+ Returns:
495
+ The voice-converted audio segment.
496
+ """
497
+ feats = torch.from_numpy(audio0)
498
+ if self.is_half:
499
+ feats = feats.half()
500
+ else:
501
+ feats = feats.float()
502
+ if feats.dim() == 2:
503
+ feats = feats.mean(-1)
504
+ assert feats.dim() == 1, feats.dim()
505
+ feats = feats.view(1, -1)
506
+ padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
507
+
508
+ inputs = {
509
+ "source": feats.to(self.device),
510
+ "padding_mask": padding_mask,
511
+ "output_layer": 9 if version == "v1" else 12,
512
+ }
513
+ with torch.no_grad():
514
+ logits = model.extract_features(**inputs)
515
+ feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
516
+ if protect < 0.5 and pitch is not None and pitchf is not None:
517
+ feats0 = feats.clone()
518
+ if (
519
+ index is not None
520
+ and big_npy is not None
521
+ and index_rate != 0
522
+ ):
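+ # Retrieve the 8 nearest training features for every frame, weight them by
+ # 1 / score^2 (normalized per frame), and blend the result with the live
+ # features according to index_rate.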
523
+ npy = feats[0].cpu().numpy()
524
+ if self.is_half:
525
+ npy = npy.astype("float32")
526
+
527
+ score, ix = index.search(npy, k=8)
528
+ weight = np.square(1 / score)
529
+ weight /= weight.sum(axis=1, keepdims=True)
530
+ npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
531
+
532
+ if self.is_half:
533
+ npy = npy.astype("float16")
534
+ feats = (
535
+ torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
536
+ + (1 - index_rate) * feats
537
+ )
538
+
539
+ feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
540
+ if protect < 0.5 and pitch is not None and pitchf is not None:
541
+ feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
542
+ 0, 2, 1
543
+ )
544
+ p_len = audio0.shape[0] // self.window
545
+ if feats.shape[1] < p_len:
546
+ p_len = feats.shape[1]
547
+ if pitch is not None and pitchf is not None:
548
+ pitch = pitch[:, :p_len]
549
+ pitchf = pitchf[:, :p_len]
550
+
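+ # With protect < 0.5, unvoiced frames (pitchf == 0) keep a share of the
+ # pre-retrieval features (feats0), which tends to preserve breaths and consonants.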
551
+ if protect < 0.5 and pitch is not None and pitchf is not None:
552
+ pitchff = pitchf.clone()
553
+ pitchff[pitchf > 0] = 1
554
+ pitchff[pitchf < 1] = protect
555
+ pitchff = pitchff.unsqueeze(-1)
556
+ feats = feats * pitchff + feats0 * (1 - pitchff)
557
+ feats = feats.to(feats0.dtype)
558
+ p_len = torch.tensor([p_len], device=self.device).long()
559
+ with torch.no_grad():
560
+ if pitch is not None and pitchf is not None:
561
+ audio1 = (
562
+ (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
563
+ .data.cpu()
564
+ .float()
565
+ .numpy()
566
+ )
567
+ else:
568
+ audio1 = (
569
+ (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
570
+ )
571
+ del feats, p_len, padding_mask
572
+ if torch.cuda.is_available():
573
+ torch.cuda.empty_cache()
574
+ return audio1
575
+
576
+ def pipeline(
577
+ self,
578
+ model,
579
+ net_g,
580
+ sid,
581
+ audio,
582
+ input_audio_path,
583
+ f0_up_key,
584
+ f0_method,
585
+ file_index,
586
+ index_rate,
587
+ pitch_guidance,
588
+ filter_radius,
589
+ tgt_sr,
590
+ resample_sr,
591
+ rms_mix_rate,
592
+ version,
593
+ protect,
594
+ hop_length,
595
+ f0_autotune,
596
+ f0_file,
597
+ ):
598
+ """
599
+ The main pipeline function for performing voice conversion.
600
+
601
+ Args:
602
+ model: The feature extractor model.
603
+ net_g: The generative model for synthesizing speech.
604
+ sid: Speaker ID for the target voice.
605
+ audio: The input audio signal.
606
+ input_audio_path: Path to the input audio file.
607
+ f0_up_key: Key to adjust the pitch of the F0 contour.
608
+ f0_method: Method to use for F0 estimation.
609
+ file_index: Path to the FAISS index file for speaker embedding retrieval.
610
+ index_rate: Blending rate for speaker embedding retrieval.
611
+ pitch_guidance: Whether to use pitch guidance during voice conversion.
612
+ filter_radius: Radius for median filtering the F0 contour.
613
+ tgt_sr: Target sampling rate for the output audio.
614
+ resample_sr: Resampling rate for the output audio.
615
+ rms_mix_rate: Blending rate for adjusting the RMS level of the output audio.
616
+ version: Model version.
617
+ protect: Protection level for preserving the original pitch.
618
+ hop_length: Hop length for F0 estimation methods.
619
+ f0_autotune: Whether to apply autotune to the F0 contour.
620
+ f0_file: Path to a file containing an F0 contour to use.
621
+
622
+ Returns:
623
+ The voice-converted audio signal.
624
+ """
625
+ if file_index != "" and os.path.exists(file_index) and index_rate != 0:
626
+ try:
627
+ index = faiss.read_index(file_index)
628
+ big_npy = index.reconstruct_n(0, index.ntotal)
629
+ except Exception as error:
630
+ print(error)
631
+ index = big_npy = None
632
+ else:
633
+ index = big_npy = None
634
+ audio = signal.filtfilt(bh, ah, audio)
635
+ audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
636
+ opt_ts = []
637
+ if audio_pad.shape[0] > self.t_max:
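+ # For long inputs, choose split points near every t_center, nudged to the
+ # quietest sample within +/- t_query so chunks are cut in low-energy regions.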
638
+ audio_sum = np.zeros_like(audio)
639
+ for i in range(self.window):
640
+ audio_sum += audio_pad[i : i - self.window]
641
+ for t in range(self.t_center, audio.shape[0], self.t_center):
642
+ opt_ts.append(
643
+ t
644
+ - self.t_query
645
+ + np.where(
646
+ np.abs(audio_sum[t - self.t_query : t + self.t_query])
647
+ == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
648
+ )[0][0]
649
+ )
650
+ s = 0
651
+ audio_opt = []
652
+ t = None
653
+ audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
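+ # Reflection-pad by t_pad samples per side; the matching t_pad_tgt samples are
+ # trimmed from every converted chunk to hide edge artifacts.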
654
+ p_len = audio_pad.shape[0] // self.window
655
+ inp_f0 = None
656
+ if hasattr(f0_file, "name"):
657
+ try:
658
+ with open(f0_file.name, "r") as f:
659
+ lines = f.read().strip("\n").split("\n")
660
+ inp_f0 = []
661
+ for line in lines:
662
+ inp_f0.append([float(i) for i in line.split(",")])
663
+ inp_f0 = np.array(inp_f0, dtype="float32")
664
+ except Exception as error:
665
+ print(error)
666
+ sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
667
+ pitch, pitchf = None, None
668
+ if pitch_guidance == 1:
669
+ pitch, pitchf = self.get_f0(
670
+ input_audio_path,
671
+ audio_pad,
672
+ p_len,
673
+ f0_up_key,
674
+ f0_method,
675
+ filter_radius,
676
+ hop_length,
677
+ f0_autotune,
678
+ inp_f0,
679
+ )
680
+ pitch = pitch[:p_len]
681
+ pitchf = pitchf[:p_len]
682
+ if self.device == "mps":
683
+ pitchf = pitchf.astype(np.float32)
684
+ pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
685
+ pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
686
+ for t in opt_ts:
687
+ t = t // self.window * self.window
688
+ if pitch_guidance == 1:
689
+ audio_opt.append(
690
+ self.voice_conversion(
691
+ model,
692
+ net_g,
693
+ sid,
694
+ audio_pad[s : t + self.t_pad2 + self.window],
695
+ pitch[:, s // self.window : (t + self.t_pad2) // self.window],
696
+ pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
697
+ index,
698
+ big_npy,
699
+ index_rate,
700
+ version,
701
+ protect,
702
+ )[self.t_pad_tgt : -self.t_pad_tgt]
703
+ )
704
+ else:
705
+ audio_opt.append(
706
+ self.voice_conversion(
707
+ model,
708
+ net_g,
709
+ sid,
710
+ audio_pad[s : t + self.t_pad2 + self.window],
711
+ None,
712
+ None,
713
+ index,
714
+ big_npy,
715
+ index_rate,
716
+ version,
717
+ protect,
718
+ )[self.t_pad_tgt : -self.t_pad_tgt]
719
+ )
720
+ s = t
721
+ if pitch_guidance == 1:
722
+ audio_opt.append(
723
+ self.voice_conversion(
724
+ model,
725
+ net_g,
726
+ sid,
727
+ audio_pad[t:],
728
+ pitch[:, t // self.window :] if t is not None else pitch,
729
+ pitchf[:, t // self.window :] if t is not None else pitchf,
730
+ index,
731
+ big_npy,
732
+ index_rate,
733
+ version,
734
+ protect,
735
+ )[self.t_pad_tgt : -self.t_pad_tgt]
736
+ )
737
+ else:
738
+ audio_opt.append(
739
+ self.voice_conversion(
740
+ model,
741
+ net_g,
742
+ sid,
743
+ audio_pad[t:],
744
+ None,
745
+ None,
746
+ index,
747
+ big_npy,
748
+ index_rate,
749
+ version,
750
+ protect,
751
+ )[self.t_pad_tgt : -self.t_pad_tgt]
752
+ )
753
+ audio_opt = np.concatenate(audio_opt)
754
+ if rms_mix_rate != 1:
755
+ audio_opt = AudioProcessor.change_rms(
756
+ audio, self.sample_rate, audio_opt, tgt_sr, rms_mix_rate
757
+ )
758
+ if resample_sr >= self.sample_rate and tgt_sr != resample_sr:
759
+ audio_opt = librosa.resample(
760
+ audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
761
+ )
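+ # Scale so the peak stays just under full scale, then convert to 16-bit PCM.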
762
+ audio_max = np.abs(audio_opt).max() / 0.99
763
+ max_int16 = 32768
764
+ if audio_max > 1:
765
+ max_int16 /= audio_max
766
+ audio_opt = (audio_opt * max_int16).astype(np.int16)
767
+ del pitch, pitchf, sid
768
+ if torch.cuda.is_available():
769
+ torch.cuda.empty_cache()
770
+ return audio_opt