feat: update infer

- infer_pack/models_onnx.py +2 -1
- infer_pack/modules/F0Predictor/DioF0Predictor.py +90 -0
- infer_pack/modules/F0Predictor/F0Predictor.py +16 -0
- infer_pack/modules/F0Predictor/HarvestF0Predictor.py +86 -0
- infer_pack/modules/F0Predictor/PMF0Predictor.py +97 -0
- infer_pack/modules/F0Predictor/__init__.py +0 -0
- infer_pack/onnx_inference.py +142 -0
- vc_infer_pipeline.py +3 -3
infer_pack/models_onnx.py
CHANGED
@@ -550,6 +550,7 @@ class SynthesizerTrnMsNSFsidM(nn.Module):
         spk_embed_dim,
         gin_channels,
         sr,
+        version,
         **kwargs
     ):
         super().__init__()
@@ -573,7 +574,7 @@ class SynthesizerTrnMsNSFsidM(nn.Module):
         self.gin_channels = gin_channels
         # self.hop_length = hop_length#
         self.spk_embed_dim = spk_embed_dim
-        if
+        if version == "v1":
             self.enc_p = TextEncoder256(
                 inter_channels,
                 hidden_channels,
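The change threads a new version argument through SynthesizerTrnMsNSFsidM so the constructor can pick the matching text encoder; only the "v1" branch (TextEncoder256) is visible in this hunk. A minimal, self-contained sketch of that dispatch pattern, with the 768-dim counterpart as an assumption rather than something shown in the diff:

    # Sketch only: select an encoder class from a version string at __init__ time.
    # Enc256/Enc768 are stand-ins; the real classes are TextEncoder256 and
    # (presumably, for "v2") a 768-dim encoder not shown in this hunk.
    class Enc256:
        emb_dim = 256

    class Enc768:
        emb_dim = 768  # assumed v2 counterpart

    class SynthSketch:
        def __init__(self, version="v1"):
            # "v1" checkpoints carry 256-dim content features
            self.enc_p = (Enc256 if version == "v1" else Enc768)()

    print(SynthSketch("v1").enc_p.emb_dim)  # 256
    print(SynthSketch("v2").enc_p.emb_dim)  # 768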
infer_pack/modules/F0Predictor/DioF0Predictor.py
ADDED
@@ -0,0 +1,90 @@
from infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
import pyworld
import numpy as np


class DioF0Predictor(F0Predictor):
    def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
        self.hop_length = hop_length
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.sampling_rate = sampling_rate

    def interpolate_f0(self, f0):
        """
        Interpolate the F0 contour (fill gaps at unvoiced frames)
        """

        data = np.reshape(f0, (f0.size, 1))

        vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
        vuv_vector[data > 0.0] = 1.0
        vuv_vector[data <= 0.0] = 0.0

        ip_data = data

        frame_number = data.size
        last_value = 0.0
        for i in range(frame_number):
            if data[i] <= 0.0:
                j = i + 1
                for j in range(i + 1, frame_number):
                    if data[j] > 0.0:
                        break
                if j < frame_number - 1:
                    if last_value > 0.0:
                        step = (data[j] - data[i - 1]) / float(j - i)
                        for k in range(i, j):
                            ip_data[k] = data[i - 1] + step * (k - i + 1)
                    else:
                        for k in range(i, j):
                            ip_data[k] = data[j]
                else:
                    for k in range(i, frame_number):
                        ip_data[k] = last_value
            else:
                ip_data[i] = data[i]  # this copy may be unnecessary
                last_value = data[i]

        return ip_data[:, 0], vuv_vector[:, 0]

    def resize_f0(self, x, target_len):
        source = np.array(x)
        source[source < 0.001] = np.nan
        target = np.interp(
            np.arange(0, len(source) * target_len, len(source)) / target_len,
            np.arange(0, len(source)),
            source,
        )
        res = np.nan_to_num(target)
        return res

    def compute_f0(self, wav, p_len=None):
        if p_len is None:
            p_len = wav.shape[0] // self.hop_length
        f0, t = pyworld.dio(
            wav.astype(np.double),
            fs=self.sampling_rate,
            f0_floor=self.f0_min,
            f0_ceil=self.f0_max,
            frame_period=1000 * self.hop_length / self.sampling_rate,
        )
        f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
        for index, pitch in enumerate(f0):
            f0[index] = round(pitch, 1)
        return self.interpolate_f0(self.resize_f0(f0, p_len))[0]

    def compute_f0_uv(self, wav, p_len=None):
        if p_len is None:
            p_len = wav.shape[0] // self.hop_length
        f0, t = pyworld.dio(
            wav.astype(np.double),
            fs=self.sampling_rate,
            f0_floor=self.f0_min,
            f0_ceil=self.f0_max,
            frame_period=1000 * self.hop_length / self.sampling_rate,
        )
        f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
        for index, pitch in enumerate(f0):
            f0[index] = round(pitch, 1)
        return self.interpolate_f0(self.resize_f0(f0, p_len))
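Example usage of the new predictor, as a sketch: it assumes pyworld and numpy are installed and uses a synthetic 220 Hz tone; compute_f0_uv returns the interpolated contour plus a voiced/unvoiced mask.

    import numpy as np
    from infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor

    sr = 16000
    t = np.arange(sr) / sr
    wav = 0.5 * np.sin(2 * np.pi * 220.0 * t)      # 1 s synthetic test tone

    predictor = DioF0Predictor(hop_length=160, sampling_rate=sr)
    f0, uv = predictor.compute_f0_uv(wav)          # f0 contour + voiced mask
    print(f0.shape)                                # (100,) -> one frame per hop
    print(float(np.median(f0[uv > 0])))            # expected to sit near 220 Hz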
infer_pack/modules/F0Predictor/F0Predictor.py
ADDED
@@ -0,0 +1,16 @@
class F0Predictor(object):
    def compute_f0(self, wav, p_len):
        """
        input: wav:[signal_length]
               p_len:int
        output: f0:[signal_length//hop_length]
        """
        pass

    def compute_f0_uv(self, wav, p_len):
        """
        input: wav:[signal_length]
               p_len:int
        output: f0:[signal_length//hop_length],uv:[signal_length//hop_length]
        """
        pass
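The base class is only an interface: anything that returns arrays of length p_len from these two methods can be handed to the loader in onnx_inference.py below. A hypothetical minimal implementation, for illustration only:

    import numpy as np
    from infer_pack.modules.F0Predictor.F0Predictor import F0Predictor

    class ConstantF0Predictor(F0Predictor):
        """Toy predictor that reports a fixed pitch for every frame."""

        def __init__(self, hop_length=512, sampling_rate=44100, value=200.0):
            self.hop_length = hop_length
            self.sampling_rate = sampling_rate
            self.value = value

        def compute_f0(self, wav, p_len=None):
            if p_len is None:
                p_len = wav.shape[0] // self.hop_length
            return np.full(p_len, self.value, dtype=np.float32)

        def compute_f0_uv(self, wav, p_len=None):
            f0 = self.compute_f0(wav, p_len)
            return f0, np.ones_like(f0)  # treat every frame as voiced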
infer_pack/modules/F0Predictor/HarvestF0Predictor.py
ADDED
@@ -0,0 +1,86 @@
from infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
import pyworld
import numpy as np


class HarvestF0Predictor(F0Predictor):
    def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
        self.hop_length = hop_length
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.sampling_rate = sampling_rate

    def interpolate_f0(self, f0):
        """
        Interpolate the F0 contour (fill gaps at unvoiced frames)
        """

        data = np.reshape(f0, (f0.size, 1))

        vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
        vuv_vector[data > 0.0] = 1.0
        vuv_vector[data <= 0.0] = 0.0

        ip_data = data

        frame_number = data.size
        last_value = 0.0
        for i in range(frame_number):
            if data[i] <= 0.0:
                j = i + 1
                for j in range(i + 1, frame_number):
                    if data[j] > 0.0:
                        break
                if j < frame_number - 1:
                    if last_value > 0.0:
                        step = (data[j] - data[i - 1]) / float(j - i)
                        for k in range(i, j):
                            ip_data[k] = data[i - 1] + step * (k - i + 1)
                    else:
                        for k in range(i, j):
                            ip_data[k] = data[j]
                else:
                    for k in range(i, frame_number):
                        ip_data[k] = last_value
            else:
                ip_data[i] = data[i]  # this copy may be unnecessary
                last_value = data[i]

        return ip_data[:, 0], vuv_vector[:, 0]

    def resize_f0(self, x, target_len):
        source = np.array(x)
        source[source < 0.001] = np.nan
        target = np.interp(
            np.arange(0, len(source) * target_len, len(source)) / target_len,
            np.arange(0, len(source)),
            source,
        )
        res = np.nan_to_num(target)
        return res

    def compute_f0(self, wav, p_len=None):
        if p_len is None:
            p_len = wav.shape[0] // self.hop_length
        f0, t = pyworld.harvest(
            wav.astype(np.double),
            fs=self.hop_length,
            f0_ceil=self.f0_max,
            f0_floor=self.f0_min,
            frame_period=1000 * self.hop_length / self.sampling_rate,
        )
        f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.fs)
        return self.interpolate_f0(self.resize_f0(f0, p_len))[0]

    def compute_f0_uv(self, wav, p_len=None):
        if p_len is None:
            p_len = wav.shape[0] // self.hop_length
        f0, t = pyworld.harvest(
            wav.astype(np.double),
            fs=self.sampling_rate,
            f0_floor=self.f0_min,
            f0_ceil=self.f0_max,
            frame_period=1000 * self.hop_length / self.sampling_rate,
        )
        f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
        return self.interpolate_f0(self.resize_f0(f0, p_len))
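One detail worth flagging: pyworld.harvest's fs argument is the audio sampling rate in Hz, and this class defines no self.fs attribute, yet compute_f0 above passes fs=self.hop_length and then reads self.fs, while compute_f0_uv passes self.sampling_rate in both places. For reference, a minimal sketch of the intended call shape (assuming pyworld is installed; the helper name is illustrative):

    import numpy as np
    import pyworld

    def harvest_f0(wav, sampling_rate, hop_length, f0_min=50, f0_max=1100):
        # fs must be the sampling rate; the hop only enters via frame_period (ms).
        f0, t = pyworld.harvest(
            wav.astype(np.double),
            fs=sampling_rate,
            f0_floor=f0_min,
            f0_ceil=f0_max,
            frame_period=1000 * hop_length / sampling_rate,
        )
        # Refine the coarse estimate at the same sampling rate.
        return pyworld.stonemask(wav.astype(np.double), f0, t, sampling_rate)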
infer_pack/modules/F0Predictor/PMF0Predictor.py
ADDED
@@ -0,0 +1,97 @@
from infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
import parselmouth
import numpy as np


class PMF0Predictor(F0Predictor):
    def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
        self.hop_length = hop_length
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.sampling_rate = sampling_rate

    def interpolate_f0(self, f0):
        """
        Interpolate the F0 contour (fill gaps at unvoiced frames)
        """

        data = np.reshape(f0, (f0.size, 1))

        vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
        vuv_vector[data > 0.0] = 1.0
        vuv_vector[data <= 0.0] = 0.0

        ip_data = data

        frame_number = data.size
        last_value = 0.0
        for i in range(frame_number):
            if data[i] <= 0.0:
                j = i + 1
                for j in range(i + 1, frame_number):
                    if data[j] > 0.0:
                        break
                if j < frame_number - 1:
                    if last_value > 0.0:
                        step = (data[j] - data[i - 1]) / float(j - i)
                        for k in range(i, j):
                            ip_data[k] = data[i - 1] + step * (k - i + 1)
                    else:
                        for k in range(i, j):
                            ip_data[k] = data[j]
                else:
                    for k in range(i, frame_number):
                        ip_data[k] = last_value
            else:
                ip_data[i] = data[i]  # this copy may be unnecessary
                last_value = data[i]

        return ip_data[:, 0], vuv_vector[:, 0]

    def compute_f0(self, wav, p_len=None):
        x = wav
        if p_len is None:
            p_len = x.shape[0] // self.hop_length
        else:
            assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error"
        time_step = self.hop_length / self.sampling_rate * 1000
        f0 = (
            parselmouth.Sound(x, self.sampling_rate)
            .to_pitch_ac(
                time_step=time_step / 1000,
                voicing_threshold=0.6,
                pitch_floor=self.f0_min,
                pitch_ceiling=self.f0_max,
            )
            .selected_array["frequency"]
        )

        pad_size = (p_len - len(f0) + 1) // 2
        if pad_size > 0 or p_len - len(f0) - pad_size > 0:
            f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
        f0, uv = self.interpolate_f0(f0)
        return f0

    def compute_f0_uv(self, wav, p_len=None):
        x = wav
        if p_len is None:
            p_len = x.shape[0] // self.hop_length
        else:
            assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error"
        time_step = self.hop_length / self.sampling_rate * 1000
        f0 = (
            parselmouth.Sound(x, self.sampling_rate)
            .to_pitch_ac(
                time_step=time_step / 1000,
                voicing_threshold=0.6,
                pitch_floor=self.f0_min,
                pitch_ceiling=self.f0_max,
            )
            .selected_array["frequency"]
        )

        pad_size = (p_len - len(f0) + 1) // 2
        if pad_size > 0 or p_len - len(f0) - pad_size > 0:
            f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
        f0, uv = self.interpolate_f0(f0)
        return f0, uv
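Example usage, as a sketch assuming praat-parselmouth is installed; compute_f0_uv returns both the interpolated contour and the voiced/unvoiced mask, so unvoiced regions remain identifiable after interpolation:

    import numpy as np
    from infer_pack.modules.F0Predictor.PMF0Predictor import PMF0Predictor

    sr = 16000
    t = np.arange(sr) / sr
    wav = 0.5 * np.sin(2 * np.pi * 220.0 * t)   # 1 s synthetic test tone

    predictor = PMF0Predictor(hop_length=160, sampling_rate=sr)
    f0, uv = predictor.compute_f0_uv(wav)
    print(f0.shape)                             # one frame per 10 ms hop
    print(float(uv.mean()))                     # fraction of frames judged voiced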
infer_pack/modules/F0Predictor/__init__.py
ADDED
File without changes
infer_pack/onnx_inference.py
ADDED
@@ -0,0 +1,142 @@
import onnxruntime
import librosa
import numpy as np
import soundfile

class ContentVec:
    def __init__(self, vec_path="pretrained/vec-768-layer-12.onnx", device=None):
        print("load model(s) from {}".format(vec_path))
        if device == "cpu" or device is None:
            providers = ["CPUExecutionProvider"]
        elif device == "cuda":
            providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        elif device == "dml":
            providers = ["DmlExecutionProvider"]
        else:
            raise RuntimeError("Unsportted Device")
        self.model = onnxruntime.InferenceSession(vec_path, providers=providers)

    def __call__(self, wav):
        return self.forward(wav)

    def forward(self, wav):
        feats = wav
        if feats.ndim == 2:  # double channels
            feats = feats.mean(-1)
        assert feats.ndim == 1, feats.ndim
        feats = np.expand_dims(np.expand_dims(feats, 0), 0)
        onnx_input = {self.model.get_inputs()[0].name: feats}
        logits = self.model.run(None, onnx_input)[0]
        return logits.transpose(0, 2, 1)


def get_f0_predictor(f0_predictor, hop_length, sampling_rate, **kargs):
    if f0_predictor == "pm":
        from infer_pack.modules.F0Predictor.PMF0Predictor import PMF0Predictor

        f0_predictor_object = PMF0Predictor(
            hop_length=hop_length, sampling_rate=sampling_rate
        )
    elif f0_predictor == "harvest":
        from infer_pack.modules.F0Predictor.HarvestF0Predictor import HarvestF0Predictor

        f0_predictor_object = HarvestF0Predictor(
            hop_length=hop_length, sampling_rate=sampling_rate
        )
    elif f0_predictor == "dio":
        from infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor

        f0_predictor_object = DioF0Predictor(
            hop_length=hop_length, sampling_rate=sampling_rate
        )
    else:
        raise Exception("Unknown f0 predictor")
    return f0_predictor_object


class OnnxRVC:
    def __init__(
        self,
        model_path,
        sr=40000,
        hop_size=512,
        vec_path="vec-768-layer-12",
        device="cpu",
    ):
        vec_path = f"pretrained/{vec_path}.onnx"
        self.vec_model = ContentVec(vec_path, device)
        if device == "cpu" or device is None:
            providers = ["CPUExecutionProvider"]
        elif device == "cuda":
            providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        elif device == "dml":
            providers = ["DmlExecutionProvider"]
        else:
            raise RuntimeError("Unsportted Device")
        self.model = onnxruntime.InferenceSession(model_path, providers=providers)
        self.sampling_rate = sr
        self.hop_size = hop_size

    def forward(self, hubert, hubert_length, pitch, pitchf, ds, rnd):
        onnx_input = {
            self.model.get_inputs()[0].name: hubert,
            self.model.get_inputs()[1].name: hubert_length,
            self.model.get_inputs()[2].name: pitch,
            self.model.get_inputs()[3].name: pitchf,
            self.model.get_inputs()[4].name: ds,
            self.model.get_inputs()[5].name: rnd,
        }
        return (self.model.run(None, onnx_input)[0] * 32767).astype(np.int16)

    def inference(
        self,
        raw_path,
        sid,
        f0_method="dio",
        f0_up_key=0,
        pad_time=0.5,
        cr_threshold=0.02,
    ):
        f0_min = 50
        f0_max = 1100
        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
        f0_predictor = get_f0_predictor(
            f0_method,
            hop_length=self.hop_size,
            sampling_rate=self.sampling_rate,
            threshold=cr_threshold,
        )
        wav, sr = librosa.load(raw_path, sr=self.sampling_rate)
        org_length = len(wav)
        if org_length / sr > 50.0:
            raise RuntimeError("Reached Max Length")

        wav16k = librosa.resample(wav, orig_sr=self.sampling_rate, target_sr=16000)
        wav16k = wav16k

        hubert = self.vec_model(wav16k)
        hubert = np.repeat(hubert, 2, axis=2).transpose(0, 2, 1).astype(np.float32)
        hubert_length = hubert.shape[1]

        pitchf = f0_predictor.compute_f0(wav, hubert_length)
        pitchf = pitchf * 2 ** (f0_up_key / 12)
        pitch = pitchf.copy()
        f0_mel = 1127 * np.log(1 + pitch / 700)
        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
            f0_mel_max - f0_mel_min
        ) + 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > 255] = 255
        pitch = np.rint(f0_mel).astype(np.int64)

        pitchf = pitchf.reshape(1, len(pitchf)).astype(np.float32)
        pitch = pitch.reshape(1, len(pitch))
        ds = np.array([sid]).astype(np.int64)

        rnd = np.random.randn(1, 192, hubert_length).astype(np.float32)
        hubert_length = np.array([hubert_length]).astype(np.int64)

        out_wav = self.forward(hubert, hubert_length, pitch, pitchf, ds, rnd).squeeze()
        out_wav = np.pad(out_wav, (0, 2 * self.hop_size), "constant")
        return out_wav[0:org_length]
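A hypothetical end-to-end call of this ONNX pipeline; the exported model path, the ContentVec file resolved under pretrained/, and the speaker id are placeholders rather than files shipped by this commit:

    import soundfile
    from infer_pack.onnx_inference import OnnxRVC

    model = OnnxRVC(
        "weights/my_model.onnx",      # exported RVC model (assumed path)
        sr=40000,
        hop_size=512,
        vec_path="vec-768-layer-12",  # -> pretrained/vec-768-layer-12.onnx
        device="cpu",
    )
    audio = model.inference("input.wav", sid=0, f0_method="dio", f0_up_key=0)
    soundfile.write("output.wav", audio, 40000)  # int16 samples at the model rate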
vc_infer_pipeline.py
CHANGED
@@ -184,7 +184,7 @@ class VC(object):
         with torch.no_grad():
             logits = model.extract_features(**inputs)
             feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
-        if protect < 0.5:
+        if protect < 0.5 and pitch!=None and pitchf!=None:
             feats0 = feats.clone()
         if (
             isinstance(index, type(None)) == False
@@ -211,7 +211,7 @@ class VC(object):
             )

         feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
-        if protect < 0.5:
+        if protect < 0.5 and pitch!=None and pitchf!=None:
             feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
                 0, 2, 1
             )
@@ -223,7 +223,7 @@ class VC(object):
                 pitch = pitch[:, :p_len]
                 pitchf = pitchf[:, :p_len]

-        if protect < 0.5:
+        if protect < 0.5 and pitch!=None and pitchf!=None:
             pitchff = pitchf.clone()
             pitchff[pitchf > 0] = 1
             pitchff[pitchf < 1] = protect
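The added `and pitch!=None and pitchf!=None` guard matters because models without f0 guidance run the same code path with pitch and pitchf set to None, and the protect branch dereferences pitchf. A minimal sketch of the failure the guard avoids:

    # With a non-f0 model, pitch/pitchf are None; the old `if protect < 0.5:`
    # would reach pitchf.clone() and raise AttributeError. The new condition
    # simply skips the blending branch in that case.
    protect, pitch, pitchf = 0.33, None, None

    if protect < 0.5 and pitch != None and pitchf != None:  # matches the diff's style
        pitchff = pitchf.clone()
    else:
        pitchff = None  # nothing to blend

    print(pitchff)  # None -> no AttributeError on NoneType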