Spaces:
Running
Running
Kit-Lemonfoot
committed on
Commit
•
cf9bca8
1
Parent(s):
73e6a9f
Added some experimental preloading of the RMVPE and VC models to hopefully ease inference time. May break stuff.
Browse files
- app.py +19 -8
- vc_infer_pipeline.py +17 -9
app.py
CHANGED
@@ -33,15 +33,26 @@ limitation = os.getenv("SYSTEM") == "spaces"
|
|
33 |
#limitation=True
|
34 |
|
35 |
audio_mode = []
|
36 |
-
f0method_mode = [
|
37 |
-
f0method_info = "PM is fast but low quality, crepe and harvest are slow but good quality, RMVPE is the best of both worlds. (Default: RMVPE))"
|
38 |
if limitation is True:
|
|
|
39 |
audio_mode = ["TTS Audio", "Upload audio"]
|
|
|
40 |
else:
|
|
|
41 |
audio_mode = ["TTS Audio", "Youtube", "Upload audio"]
|
|
|
42 |
|
43 |
-
if os.path.isfile("rmvpe.pt"):
|
44 |
-
f0method_mode.append("rmvpe")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
def infer(name, path, index, vc_audio_mode, vc_input, vc_upload, tts_text, tts_voice, f0_up_key, f0_method, index_rate, filter_radius, resample_sr, rms_mix_rate, protect):
|
47 |
try:
|
@@ -99,10 +110,10 @@ def infer(name, path, index, vc_audio_mode, vc_input, vc_upload, tts_text, tts_v
|
|
99 |
net_g = net_g.half()
|
100 |
else:
|
101 |
net_g = net_g.float()
|
102 |
-
|
103 |
|
104 |
#Gen audio
|
105 |
-
audio_opt =
|
106 |
hubert_model,
|
107 |
net_g,
|
108 |
0,
|
@@ -125,7 +136,7 @@ def infer(name, path, index, vc_audio_mode, vc_input, vc_upload, tts_text, tts_v
|
|
125 |
)
|
126 |
info = f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
|
127 |
print(f"Successful inference with model {name} | {tts_text} | {info}")
|
128 |
-
del net_g,
|
129 |
return info, (tgt_sr, audio_opt)
|
130 |
except:
|
131 |
info = traceback.format_exc()
|
@@ -516,7 +527,7 @@ if __name__ == '__main__':
|
|
516 |
"#### <center>Original devs:\n"
|
517 |
"<center>the RVC Project, lj1995, zomehwh \n\n"
|
518 |
"#### <center>Model creators:\n"
|
519 |
-
"<center>dacoolkid44, Hijack, Maki Ligon, megaaziib, KitLemonfoot, yeey5, Sui, MahdeenSky, Itaxhix, Acato, Kyuubical,
|
520 |
)
|
521 |
if limitation is True:
|
522 |
app.queue(concurrency_count=1, max_size=20, api_open=config.api).launch(share=config.colab)
|
|
|
33 |
#limitation=True
|
34 |
|
35 |
audio_mode = []
|
36 |
+
f0method_mode = []
|
|
|
37 |
if limitation is True:
|
38 |
+
f0method_info = "PM is better for testing, RMVPE is better for finalized generations. (Default: RMVPE)"
|
39 |
audio_mode = ["TTS Audio", "Upload audio"]
|
40 |
+
f0method_mode = ["pm", "rmvpe"]
|
41 |
else:
|
42 |
+
f0method_info = "PM is fast but low quality, crepe and harvest are slow but good quality, RMVPE is the best of both worlds. (Default: RMVPE)"
|
43 |
audio_mode = ["TTS Audio", "Youtube", "Upload audio"]
|
44 |
+
f0method_mode = ["pm", "crepe", "harvest", "rmvpe"]
|
45 |
|
46 |
+
#if os.path.isfile("rmvpe.pt"):
|
47 |
+
# f0method_mode.append("rmvpe")
|
48 |
+
|
49 |
+
|
50 |
+
#Eagerload VCs
|
51 |
+
print("Preloading VCs...")
|
52 |
+
vcArr=[]
|
53 |
+
vcArr.append(VC(32000, config))
|
54 |
+
vcArr.append(VC(40000, config))
|
55 |
+
vcArr.append(VC(48000, config))
|
56 |
|
57 |
def infer(name, path, index, vc_audio_mode, vc_input, vc_upload, tts_text, tts_voice, f0_up_key, f0_method, index_rate, filter_radius, resample_sr, rms_mix_rate, protect):
|
58 |
try:
|
|
|
110 |
net_g = net_g.half()
|
111 |
else:
|
112 |
net_g = net_g.float()
|
113 |
+
vcIdx = int((tgt_sr/8000)-4)
|
114 |
|
115 |
#Gen audio
|
116 |
+
audio_opt = vcArr[vcIdx].pipeline(
|
117 |
hubert_model,
|
118 |
net_g,
|
119 |
0,
|
|
|
136 |
)
|
137 |
info = f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
|
138 |
print(f"Successful inference with model {name} | {tts_text} | {info}")
|
139 |
+
del net_g, cpt
|
140 |
return info, (tgt_sr, audio_opt)
|
141 |
except:
|
142 |
info = traceback.format_exc()
|
|
|
527 |
"#### <center>Original devs:\n"
|
528 |
"<center>the RVC Project, lj1995, zomehwh \n\n"
|
529 |
"#### <center>Model creators:\n"
|
530 |
+
"<center>dacoolkid44, Hijack, Maki Ligon, megaaziib, KitLemonfoot, yeey5, Sui, MahdeenSky, Itaxhix, Acato, Kyuubical, Listra92, IshimaIshimsky, ZomballTH, Jotape91, RigidSpinner, RandomAssBettel, Mimizukari, Oida, Shu-Kun, Nhat Minh, Ardha27, Legitdark, TempoHawk, 0x3e9, Kaiaya, Skeetawn, Sonphantrung, Pianissimo, Gloomwastragic, Sunesu, Aimbo, Act8113, Blyxeen\n"
|
531 |
)
|
532 |
if limitation is True:
|
533 |
app.queue(concurrency_count=1, max_size=20, api_open=config.api).launch(share=config.colab)
|
vc_infer_pipeline.py
CHANGED
@@ -13,6 +13,14 @@ bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
|
|
13 |
|
14 |
input_audio_path2wav = {}
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
@lru_cache
|
18 |
def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
|
@@ -128,14 +136,14 @@ class VC(object):
|
|
128 |
f0[pd < 0.1] = 0
|
129 |
f0 = f0[0].cpu().numpy()
|
130 |
elif f0_method == "rmvpe":
|
131 |
-
if hasattr(self, "model_rmvpe") == False:
|
132 |
-
from rmvpe import RMVPE
|
133 |
-
|
134 |
-
print("loading rmvpe model")
|
135 |
-
self.model_rmvpe = RMVPE(
|
136 |
-
"rmvpe.pt", is_half=self.is_half, device=self.device
|
137 |
-
)
|
138 |
-
f0 =
|
139 |
f0 *= pow(2, f0_up_key / 12)
|
140 |
# with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
|
141 |
tf0 = self.sr // self.window # 每秒f0点数
|
@@ -440,4 +448,4 @@ class VC(object):
|
|
440 |
del pitch, pitchf, sid
|
441 |
if torch.cuda.is_available():
|
442 |
torch.cuda.empty_cache()
|
443 |
-
return audio_opt
|
|
|
13 |
|
14 |
input_audio_path2wav = {}
|
15 |
|
16 |
+
#Attempting a eagerload of the RMVPE model here.
|
17 |
+
from config import Config
|
18 |
+
config = Config()
|
19 |
+
from rmvpe import RMVPE
|
20 |
+
print("Preloading RMVPE model")
|
21 |
+
model_rmvpe = RMVPE("rmvpe.pt", is_half=config.is_half, device=config.device)
|
22 |
+
del config
|
23 |
+
|
24 |
|
25 |
@lru_cache
|
26 |
def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
|
|
|
136 |
f0[pd < 0.1] = 0
|
137 |
f0 = f0[0].cpu().numpy()
|
138 |
elif f0_method == "rmvpe":
|
139 |
+
## if hasattr(self, "model_rmvpe") == False:
|
140 |
+
## from rmvpe import RMVPE
|
141 |
+
##
|
142 |
+
## print("loading rmvpe model")
|
143 |
+
## self.model_rmvpe = RMVPE(
|
144 |
+
## "rmvpe.pt", is_half=self.is_half, device=self.device
|
145 |
+
## )
|
146 |
+
f0 = model_rmvpe.infer_from_audio(x, thred=0.03)
|
147 |
f0 *= pow(2, f0_up_key / 12)
|
148 |
# with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
|
149 |
tf0 = self.sr // self.window # 每秒f0点数
|
|
|
448 |
del pitch, pitchf, sid
|
449 |
if torch.cuda.is_available():
|
450 |
torch.cuda.empty_cache()
|
451 |
+
return audio_opt
|