Spaces:
Running
Running
Kit-Lemonfoot
committed on
Commit
•
cf9bca8
1
Parent(s):
73e6a9f
Added some experimental preloading of the RMVPE and VC models to hopefully ease inference time. May break stuff.
Browse files
- app.py +19 -8
- vc_infer_pipeline.py +17 -9
app.py
CHANGED
@@ -33,15 +33,26 @@ limitation = os.getenv("SYSTEM") == "spaces"
|
|
33 |
#limitation=True
|
34 |
|
35 |
audio_mode = []
|
36 |
-
f0method_mode = [
|
37 |
-
f0method_info = "PM is fast but low quality, crepe and harvest are slow but good quality, RMVPE is the best of both worlds. (Default: RMVPE))"
|
38 |
if limitation is True:
|
|
|
39 |
audio_mode = ["TTS Audio", "Upload audio"]
|
|
|
40 |
else:
|
|
|
41 |
audio_mode = ["TTS Audio", "Youtube", "Upload audio"]
|
|
|
42 |
|
43 |
-
if os.path.isfile("rmvpe.pt"):
|
44 |
-
f0method_mode.append("rmvpe")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
def infer(name, path, index, vc_audio_mode, vc_input, vc_upload, tts_text, tts_voice, f0_up_key, f0_method, index_rate, filter_radius, resample_sr, rms_mix_rate, protect):
|
47 |
try:
|
@@ -99,10 +110,10 @@ def infer(name, path, index, vc_audio_mode, vc_input, vc_upload, tts_text, tts_v
|
|
99 |
net_g = net_g.half()
|
100 |
else:
|
101 |
net_g = net_g.float()
|
102 |
-
|
103 |
|
104 |
#Gen audio
|
105 |
-
audio_opt =
|
106 |
hubert_model,
|
107 |
net_g,
|
108 |
0,
|
@@ -125,7 +136,7 @@ def infer(name, path, index, vc_audio_mode, vc_input, vc_upload, tts_text, tts_v
|
|
125 |
)
|
126 |
info = f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
|
127 |
print(f"Successful inference with model {name} | {tts_text} | {info}")
|
128 |
-
del net_g,
|
129 |
return info, (tgt_sr, audio_opt)
|
130 |
except:
|
131 |
info = traceback.format_exc()
|
@@ -516,7 +527,7 @@ if __name__ == '__main__':
|
|
516 |
"#### <center>Original devs:\n"
|
517 |
"<center>the RVC Project, lj1995, zomehwh \n\n"
|
518 |
"#### <center>Model creators:\n"
|
519 |
-
"<center>dacoolkid44, Hijack, Maki Ligon, megaaziib, KitLemonfoot, yeey5, Sui, MahdeenSky, Itaxhix, Acato, Kyuubical,
|
520 |
)
|
521 |
if limitation is True:
|
522 |
app.queue(concurrency_count=1, max_size=20, api_open=config.api).launch(share=config.colab)
|
|
|
33 |
#limitation=True
|
34 |
|
35 |
audio_mode = []
|
36 |
+
f0method_mode = []
|
|
|
37 |
if limitation is True:
|
38 |
+
f0method_info = "PM is better for testing, RMVPE is better for finalized generations. (Default: RMVPE)"
|
39 |
audio_mode = ["TTS Audio", "Upload audio"]
|
40 |
+
f0method_mode = ["pm", "rmvpe"]
|
41 |
else:
|
42 |
+
f0method_info = "PM is fast but low quality, crepe and harvest are slow but good quality, RMVPE is the best of both worlds. (Default: RMVPE)"
|
43 |
audio_mode = ["TTS Audio", "Youtube", "Upload audio"]
|
44 |
+
f0method_mode = ["pm", "crepe", "harvest", "rmvpe"]
|
45 |
|
46 |
+
#if os.path.isfile("rmvpe.pt"):
|
47 |
+
# f0method_mode.append("rmvpe")
|
48 |
+
|
49 |
+
|
50 |
+
#Eagerload VCs
|
51 |
+
print("Preloading VCs...")
|
52 |
+
vcArr=[]
|
53 |
+
vcArr.append(VC(32000, config))
|
54 |
+
vcArr.append(VC(40000, config))
|
55 |
+
vcArr.append(VC(48000, config))
|
56 |
|
57 |
def infer(name, path, index, vc_audio_mode, vc_input, vc_upload, tts_text, tts_voice, f0_up_key, f0_method, index_rate, filter_radius, resample_sr, rms_mix_rate, protect):
|
58 |
try:
|
|
|
110 |
net_g = net_g.half()
|
111 |
else:
|
112 |
net_g = net_g.float()
|
113 |
+
vcIdx = int((tgt_sr/8000)-4)
|
114 |
|
115 |
#Gen audio
|
116 |
+
audio_opt = vcArr[vcIdx].pipeline(
|
117 |
hubert_model,
|
118 |
net_g,
|
119 |
0,
|
|
|
136 |
)
|
137 |
info = f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
|
138 |
print(f"Successful inference with model {name} | {tts_text} | {info}")
|
139 |
+
del net_g, cpt
|
140 |
return info, (tgt_sr, audio_opt)
|
141 |
except:
|
142 |
info = traceback.format_exc()
|
|
|
527 |
"#### <center>Original devs:\n"
|
528 |
"<center>the RVC Project, lj1995, zomehwh \n\n"
|
529 |
"#### <center>Model creators:\n"
|
530 |
+
"<center>dacoolkid44, Hijack, Maki Ligon, megaaziib, KitLemonfoot, yeey5, Sui, MahdeenSky, Itaxhix, Acato, Kyuubical, Listra92, IshimaIshimsky, ZomballTH, Jotape91, RigidSpinner, RandomAssBettel, Mimizukari, Oida, Shu-Kun, Nhat Minh, Ardha27, Legitdark, TempoHawk, 0x3e9, Kaiaya, Skeetawn, Sonphantrung, Pianissimo, Gloomwastragic, Sunesu, Aimbo, Act8113, Blyxeen\n"
|
531 |
)
|
532 |
if limitation is True:
|
533 |
app.queue(concurrency_count=1, max_size=20, api_open=config.api).launch(share=config.colab)
|
vc_infer_pipeline.py
CHANGED
@@ -13,6 +13,14 @@ bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
|
|
13 |
|
14 |
input_audio_path2wav = {}
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
@lru_cache
|
18 |
def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
|
@@ -128,14 +136,14 @@ class VC(object):
|
|
128 |
f0[pd < 0.1] = 0
|
129 |
f0 = f0[0].cpu().numpy()
|
130 |
elif f0_method == "rmvpe":
|
131 |
-
if hasattr(self, "model_rmvpe") == False:
|
132 |
-
from rmvpe import RMVPE
|
133 |
-
|
134 |
-
print("loading rmvpe model")
|
135 |
-
self.model_rmvpe = RMVPE(
|
136 |
-
"rmvpe.pt", is_half=self.is_half, device=self.device
|
137 |
-
)
|
138 |
-
f0 =
|
139 |
f0 *= pow(2, f0_up_key / 12)
|
140 |
# with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
|
141 |
tf0 = self.sr // self.window # 每秒f0点数
|
@@ -440,4 +448,4 @@ class VC(object):
|
|
440 |
del pitch, pitchf, sid
|
441 |
if torch.cuda.is_available():
|
442 |
torch.cuda.empty_cache()
|
443 |
-
return audio_opt
|
|
|
13 |
|
14 |
input_audio_path2wav = {}
|
15 |
|
16 |
+
#Attempting a eagerload of the RMVPE model here.
|
17 |
+
from config import Config
|
18 |
+
config = Config()
|
19 |
+
from rmvpe import RMVPE
|
20 |
+
print("Preloading RMVPE model")
|
21 |
+
model_rmvpe = RMVPE("rmvpe.pt", is_half=config.is_half, device=config.device)
|
22 |
+
del config
|
23 |
+
|
24 |
|
25 |
@lru_cache
|
26 |
def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
|
|
|
136 |
f0[pd < 0.1] = 0
|
137 |
f0 = f0[0].cpu().numpy()
|
138 |
elif f0_method == "rmvpe":
|
139 |
+
## if hasattr(self, "model_rmvpe") == False:
|
140 |
+
## from rmvpe import RMVPE
|
141 |
+
##
|
142 |
+
## print("loading rmvpe model")
|
143 |
+
## self.model_rmvpe = RMVPE(
|
144 |
+
## "rmvpe.pt", is_half=self.is_half, device=self.device
|
145 |
+
## )
|
146 |
+
f0 = model_rmvpe.infer_from_audio(x, thred=0.03)
|
147 |
f0 *= pow(2, f0_up_key / 12)
|
148 |
# with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
|
149 |
tf0 = self.sr // self.window # 每秒f0点数
|
|
|
448 |
del pitch, pitchf, sid
|
449 |
if torch.cuda.is_available():
|
450 |
torch.cuda.empty_cache()
|
451 |
+
return audio_opt
|