Spaces (Running)
Eddycrack864 committed · Commit 4372559
Parent(s): 929a717

Upload 10 files
Files changed:
- README.md +1 -1
- UVR.py +0 -0
- UVR_interface.py +852 -0
- __version__.py +4 -0
- app.py +3 -0
- packages.txt +3 -0
- requirements.txt +43 -0
- separate.py +942 -0
- webUI.py +285 -0
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: ⚡
 colorFrom: indigo
 colorTo: purple
 sdk: gradio
-sdk_version:
+sdk_version: 3.44.2
 app_file: app.py
 pinned: false
 license: openrail
UVR.py ADDED
The diff for this file is too large to render. See raw diff.
UVR_interface.py ADDED
@@ -0,0 +1,852 @@
import audioread
import librosa

import os
import sys
import json
import time
from tqdm import tqdm
import pickle
import hashlib
import logging
import traceback
import shutil
import soundfile as sf

import torch

from gui_data.constants import *
from gui_data.old_data_check import file_check, remove_unneeded_yamls, remove_temps
from lib_v5.vr_network.model_param_init import ModelParameters
from lib_v5 import spec_utils
from pathlib import Path
from separate import SeperateAttributes, SeperateDemucs, SeperateMDX, SeperateVR, save_format
from typing import List


logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)
logging.info('UVR BEGIN')

PREVIOUS_PATCH_WIN = 'UVR_Patch_1_12_23_14_54'

is_dnd_compatible = True
banner_placement = -2

def save_data(data):
    """
    Saves given data as a .pkl (pickle) file

    Parameters:
        data(dict):
            Dictionary containing all the necessary data to save
    """
    # Open data file, create it if it does not exist
    with open('data.pkl', 'wb') as data_file:
        pickle.dump(data, data_file)

def load_data() -> dict:
    """
    Loads saved pkl file and returns the stored data

    Returns(dict):
        Dictionary containing all the saved data
    """
    try:
        with open('data.pkl', 'rb') as data_file:  # Open data file
            data = pickle.load(data_file)

        return data
    except (ValueError, FileNotFoundError):
        # Data file is corrupted or not found, so recreate it
        save_data(data=DEFAULT_DATA)

        return load_data()

def load_model_hash_data(dictionary):
    '''Get the model hash dictionary'''

    with open(dictionary) as d:
        data = d.read()

    return json.loads(data)

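The two helpers above persist application state as `data.pkl`, recreating the file from `DEFAULT_DATA` whenever it is missing or unreadable. A minimal round-trip, assuming this module is imported (the key name is just illustrative):

    save_data({'save_format': 'WAV'})
    settings = load_data()           # falls back to DEFAULT_DATA and retries if data.pkl is corrupted
    print(settings['save_format'])   # -> 'WAV'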
# Change the current working directory to the directory
# this file sits in
if getattr(sys, 'frozen', False):
    # If the application is run as a bundle, the PyInstaller bootloader
    # extends the sys module by a flag frozen=True and sets the app
    # path into the variable _MEIPASS.
    BASE_PATH = sys._MEIPASS
else:
    BASE_PATH = os.path.dirname(os.path.abspath(__file__))

os.chdir(BASE_PATH)  # Change the current working directory to the base path

debugger = []

#--Constants--
#Models
MODELS_DIR = os.path.join(BASE_PATH, 'models')
VR_MODELS_DIR = os.path.join(MODELS_DIR, 'VR_Models')
MDX_MODELS_DIR = os.path.join(MODELS_DIR, 'MDX_Net_Models')
DEMUCS_MODELS_DIR = os.path.join(MODELS_DIR, 'Demucs_Models')
DEMUCS_NEWER_REPO_DIR = os.path.join(DEMUCS_MODELS_DIR, 'v3_v4_repo')
MDX_MIXER_PATH = os.path.join(BASE_PATH, 'lib_v5', 'mixer.ckpt')

#Cache & Parameters
VR_HASH_DIR = os.path.join(VR_MODELS_DIR, 'model_data')
VR_HASH_JSON = os.path.join(VR_MODELS_DIR, 'model_data', 'model_data.json')
MDX_HASH_DIR = os.path.join(MDX_MODELS_DIR, 'model_data')
MDX_HASH_JSON = os.path.join(MDX_MODELS_DIR, 'model_data', 'model_data.json')
DEMUCS_MODEL_NAME_SELECT = os.path.join(DEMUCS_MODELS_DIR, 'model_data', 'model_name_mapper.json')
MDX_MODEL_NAME_SELECT = os.path.join(MDX_MODELS_DIR, 'model_data', 'model_name_mapper.json')
ENSEMBLE_CACHE_DIR = os.path.join(BASE_PATH, 'gui_data', 'saved_ensembles')
SETTINGS_CACHE_DIR = os.path.join(BASE_PATH, 'gui_data', 'saved_settings')
VR_PARAM_DIR = os.path.join(BASE_PATH, 'lib_v5', 'vr_network', 'modelparams')
SAMPLE_CLIP_PATH = os.path.join(BASE_PATH, 'temp_sample_clips')
ENSEMBLE_TEMP_PATH = os.path.join(BASE_PATH, 'ensemble_temps')

#Style
ICON_IMG_PATH = os.path.join(BASE_PATH, 'gui_data', 'img', 'GUI-Icon.ico')
FONT_PATH = os.path.join(BASE_PATH, 'gui_data', 'fonts', 'centurygothic', 'GOTHIC.TTF')

#Other
COMPLETE_CHIME = os.path.join(BASE_PATH, 'gui_data', 'complete_chime.wav')
FAIL_CHIME = os.path.join(BASE_PATH, 'gui_data', 'fail_chime.wav')
CHANGE_LOG = os.path.join(BASE_PATH, 'gui_data', 'change_log.txt')
SPLASH_DOC = os.path.join(BASE_PATH, 'tmp', 'splash.txt')

file_check(os.path.join(MODELS_DIR, 'Main_Models'), VR_MODELS_DIR)
file_check(os.path.join(DEMUCS_MODELS_DIR, 'v3_repo'), DEMUCS_NEWER_REPO_DIR)
remove_unneeded_yamls(DEMUCS_MODELS_DIR)

remove_temps(ENSEMBLE_TEMP_PATH)
remove_temps(SAMPLE_CLIP_PATH)
remove_temps(os.path.join(BASE_PATH, 'img'))

if not os.path.isdir(ENSEMBLE_TEMP_PATH):
    os.mkdir(ENSEMBLE_TEMP_PATH)

if not os.path.isdir(SAMPLE_CLIP_PATH):
    os.mkdir(SAMPLE_CLIP_PATH)

model_hash_table = {}
data = load_data()

class ModelData():
    def __init__(self, model_name: str,
                 selected_process_method=ENSEMBLE_MODE,
                 is_secondary_model=False,
                 primary_model_primary_stem=None,
                 is_primary_model_primary_stem_only=False,
                 is_primary_model_secondary_stem_only=False,
                 is_pre_proc_model=False,
                 is_dry_check=False):

        self.is_gpu_conversion = 0 if root.is_gpu_conversion_var.get() else -1
        self.is_normalization = root.is_normalization_var.get()
        self.is_primary_stem_only = root.is_primary_stem_only_var.get()
        self.is_secondary_stem_only = root.is_secondary_stem_only_var.get()
        self.is_denoise = root.is_denoise_var.get()
        self.mdx_batch_size = 1 if root.mdx_batch_size_var.get() == DEF_OPT else int(root.mdx_batch_size_var.get())
        self.is_mdx_ckpt = False
        self.wav_type_set = root.wav_type_set
        self.mp3_bit_set = root.mp3_bit_set_var.get()
        self.save_format = root.save_format_var.get()
        self.is_invert_spec = root.is_invert_spec_var.get()
        self.is_mixer_mode = root.is_mixer_mode_var.get()
        self.demucs_stems = root.demucs_stems_var.get()
        self.demucs_source_list = []
        self.demucs_stem_count = 0
        self.mixer_path = MDX_MIXER_PATH
        self.model_name = model_name
        self.process_method = selected_process_method
        self.model_status = False if self.model_name == CHOOSE_MODEL or self.model_name == NO_MODEL else True
        self.primary_stem = None
        self.secondary_stem = None
        self.is_ensemble_mode = False
        self.ensemble_primary_stem = None
        self.ensemble_secondary_stem = None
        self.primary_model_primary_stem = primary_model_primary_stem
        self.is_secondary_model = is_secondary_model
        self.secondary_model = None
        self.secondary_model_scale = None
        self.demucs_4_stem_added_count = 0
        self.is_demucs_4_stem_secondaries = False
        self.is_4_stem_ensemble = False
        self.pre_proc_model = None
        self.pre_proc_model_activated = False
        self.is_pre_proc_model = is_pre_proc_model
        self.is_dry_check = is_dry_check
        self.model_samplerate = 44100
        self.model_capacity = 32, 128
        self.is_vr_51_model = False
        self.is_demucs_pre_proc_model_inst_mix = False
        self.manual_download_Button = None
        self.secondary_model_4_stem = []
        self.secondary_model_4_stem_scale = []
        self.secondary_model_4_stem_names = []
        self.secondary_model_4_stem_model_names_list = []
        self.all_models = []
        self.secondary_model_other = None
        self.secondary_model_scale_other = None
        self.secondary_model_bass = None
        self.secondary_model_scale_bass = None
        self.secondary_model_drums = None
        self.secondary_model_scale_drums = None

        if selected_process_method == ENSEMBLE_MODE:
            partitioned_name = model_name.partition(ENSEMBLE_PARTITION)
            self.process_method = partitioned_name[0]
            self.model_name = partitioned_name[2]
            self.model_and_process_tag = model_name
            self.ensemble_primary_stem, self.ensemble_secondary_stem = root.return_ensemble_stems()
            self.is_ensemble_mode = True if not is_secondary_model and not is_pre_proc_model else False
            self.is_4_stem_ensemble = True if root.ensemble_main_stem_var.get() == FOUR_STEM_ENSEMBLE and self.is_ensemble_mode else False
            self.pre_proc_model_activated = root.is_demucs_pre_proc_model_activate_var.get() if not self.ensemble_primary_stem == VOCAL_STEM else False

        if self.process_method == VR_ARCH_TYPE:
            self.is_secondary_model_activated = root.vr_is_secondary_model_activate_var.get() if not self.is_secondary_model else False
            self.aggression_setting = float(int(root.aggression_setting_var.get())/100)
            self.is_tta = root.is_tta_var.get()
            self.is_post_process = root.is_post_process_var.get()
            self.window_size = int(root.window_size_var.get())
            self.batch_size = 1 if root.batch_size_var.get() == DEF_OPT else int(root.batch_size_var.get())
            self.crop_size = int(root.crop_size_var.get())
            self.is_high_end_process = 'mirroring' if root.is_high_end_process_var.get() else 'None'
            self.post_process_threshold = float(root.post_process_threshold_var.get())
            self.model_capacity = 32, 128
            self.model_path = os.path.join(VR_MODELS_DIR, f"{self.model_name}.pth")
            self.get_model_hash()
            if self.model_hash:
                self.model_data = self.get_model_data(VR_HASH_DIR, root.vr_hash_MAPPER) if not self.model_hash == WOOD_INST_MODEL_HASH else WOOD_INST_PARAMS
                if self.model_data:
                    vr_model_param = os.path.join(VR_PARAM_DIR, "{}.json".format(self.model_data["vr_model_param"]))
                    self.primary_stem = self.model_data["primary_stem"]
                    self.secondary_stem = STEM_PAIR_MAPPER[self.primary_stem]
                    self.vr_model_param = ModelParameters(vr_model_param)
                    self.model_samplerate = self.vr_model_param.param['sr']
                    if "nout" in self.model_data.keys() and "nout_lstm" in self.model_data.keys():
                        self.model_capacity = self.model_data["nout"], self.model_data["nout_lstm"]
                        self.is_vr_51_model = True
                else:
                    self.model_status = False

        if self.process_method == MDX_ARCH_TYPE:
            self.is_secondary_model_activated = root.mdx_is_secondary_model_activate_var.get() if not is_secondary_model else False
            self.margin = int(root.margin_var.get())
            self.chunks = root.determine_auto_chunks(root.chunks_var.get(), self.is_gpu_conversion) if root.is_chunk_mdxnet_var.get() else 0
            self.get_mdx_model_path()
            self.get_model_hash()
            if self.model_hash:
                self.model_data = self.get_model_data(MDX_HASH_DIR, root.mdx_hash_MAPPER)
                if self.model_data:
                    self.compensate = self.model_data["compensate"] if root.compensate_var.get() == AUTO_SELECT else float(root.compensate_var.get())
                    self.mdx_dim_f_set = self.model_data["mdx_dim_f_set"]
                    self.mdx_dim_t_set = self.model_data["mdx_dim_t_set"]
                    self.mdx_n_fft_scale_set = self.model_data["mdx_n_fft_scale_set"]
                    self.primary_stem = self.model_data["primary_stem"]
                    self.secondary_stem = STEM_PAIR_MAPPER[self.primary_stem]
                else:
                    self.model_status = False

        if self.process_method == DEMUCS_ARCH_TYPE:
            self.is_secondary_model_activated = root.demucs_is_secondary_model_activate_var.get() if not is_secondary_model else False
            if not self.is_ensemble_mode:
                self.pre_proc_model_activated = root.is_demucs_pre_proc_model_activate_var.get() if not root.demucs_stems_var.get() in [VOCAL_STEM, INST_STEM] else False
            self.overlap = float(root.overlap_var.get())
            self.margin_demucs = int(root.margin_demucs_var.get())
            self.chunks_demucs = root.determine_auto_chunks(root.chunks_demucs_var.get(), self.is_gpu_conversion)
            self.shifts = int(root.shifts_var.get())
            self.is_split_mode = root.is_split_mode_var.get()
            self.segment = root.segment_var.get()
            self.is_chunk_demucs = root.is_chunk_demucs_var.get()
            self.is_demucs_combine_stems = root.is_demucs_combine_stems_var.get()
            self.is_primary_stem_only = root.is_primary_stem_only_var.get() if self.is_ensemble_mode else root.is_primary_stem_only_Demucs_var.get()
            self.is_secondary_stem_only = root.is_secondary_stem_only_var.get() if self.is_ensemble_mode else root.is_secondary_stem_only_Demucs_var.get()
            self.get_demucs_model_path()
            self.get_demucs_model_data()

        self.model_basename = os.path.splitext(os.path.basename(self.model_path))[0] if self.model_status else None
        self.pre_proc_model_activated = self.pre_proc_model_activated if not self.is_secondary_model else False

        self.is_primary_model_primary_stem_only = is_primary_model_primary_stem_only
        self.is_primary_model_secondary_stem_only = is_primary_model_secondary_stem_only

        if self.is_secondary_model_activated and self.model_status:
            if (not self.is_ensemble_mode and root.demucs_stems_var.get() == ALL_STEMS and self.process_method == DEMUCS_ARCH_TYPE) or self.is_4_stem_ensemble:
                for key in DEMUCS_4_SOURCE_LIST:
                    self.secondary_model_data(key)
                    self.secondary_model_4_stem.append(self.secondary_model)
                    self.secondary_model_4_stem_scale.append(self.secondary_model_scale)
                    self.secondary_model_4_stem_names.append(key)
                self.demucs_4_stem_added_count = sum(i is not None for i in self.secondary_model_4_stem)
                self.is_secondary_model_activated = False if all(i is None for i in self.secondary_model_4_stem) else True
                self.demucs_4_stem_added_count = self.demucs_4_stem_added_count - 1 if self.is_secondary_model_activated else self.demucs_4_stem_added_count
                if self.is_secondary_model_activated:
                    self.secondary_model_4_stem_model_names_list = [None if i is None else i.model_basename for i in self.secondary_model_4_stem]
                    self.is_demucs_4_stem_secondaries = True
            else:
                primary_stem = self.ensemble_primary_stem if self.is_ensemble_mode and self.process_method == DEMUCS_ARCH_TYPE else self.primary_stem
                self.secondary_model_data(primary_stem)

        if self.process_method == DEMUCS_ARCH_TYPE and not is_secondary_model:
            if self.demucs_stem_count >= 3 and self.pre_proc_model_activated:
                self.pre_proc_model_activated = True
                self.pre_proc_model = root.process_determine_demucs_pre_proc_model(self.primary_stem)
                self.is_demucs_pre_proc_model_inst_mix = root.is_demucs_pre_proc_model_inst_mix_var.get() if self.pre_proc_model else False

    def secondary_model_data(self, primary_stem):
        secondary_model_data = root.process_determine_secondary_model(self.process_method, primary_stem, self.is_primary_stem_only, self.is_secondary_stem_only)
        self.secondary_model = secondary_model_data[0]
        self.secondary_model_scale = secondary_model_data[1]
        self.is_secondary_model_activated = False if not self.secondary_model else True
        if self.secondary_model:
            self.is_secondary_model_activated = False if self.secondary_model.model_basename == self.model_basename else True

    def get_mdx_model_path(self):

        if self.model_name.endswith(CKPT):
            # self.chunks = 0
            # self.is_mdx_batch_mode = True
            self.is_mdx_ckpt = True

        ext = '' if self.is_mdx_ckpt else ONNX

        for file_name, chosen_mdx_model in root.mdx_name_select_MAPPER.items():
            if self.model_name in chosen_mdx_model:
                self.model_path = os.path.join(MDX_MODELS_DIR, f"{file_name}{ext}")
                break
        else:
            self.model_path = os.path.join(MDX_MODELS_DIR, f"{self.model_name}{ext}")

        self.mixer_path = os.path.join(MDX_MODELS_DIR, "mixer_val.ckpt")

    def get_demucs_model_path(self):

        demucs_newer = [True for x in DEMUCS_NEWER_TAGS if x in self.model_name]
        demucs_model_dir = DEMUCS_NEWER_REPO_DIR if demucs_newer else DEMUCS_MODELS_DIR

        for file_name, chosen_model in root.demucs_name_select_MAPPER.items():
            if self.model_name in chosen_model:
                self.model_path = os.path.join(demucs_model_dir, file_name)
                break
        else:
            self.model_path = os.path.join(DEMUCS_NEWER_REPO_DIR, f'{self.model_name}.yaml')

    def get_demucs_model_data(self):

        self.demucs_version = DEMUCS_V4

        for key, value in DEMUCS_VERSION_MAPPER.items():
            if value in self.model_name:
                self.demucs_version = key

        self.demucs_source_list = DEMUCS_2_SOURCE if DEMUCS_UVR_MODEL in self.model_name else DEMUCS_4_SOURCE
        self.demucs_source_map = DEMUCS_2_SOURCE_MAPPER if DEMUCS_UVR_MODEL in self.model_name else DEMUCS_4_SOURCE_MAPPER
        self.demucs_stem_count = 2 if DEMUCS_UVR_MODEL in self.model_name else 4

        if not self.is_ensemble_mode:
            self.primary_stem = PRIMARY_STEM if self.demucs_stems == ALL_STEMS else self.demucs_stems
            self.secondary_stem = STEM_PAIR_MAPPER[self.primary_stem]

    def get_model_data(self, model_hash_dir, hash_mapper):
        model_settings_json = os.path.join(model_hash_dir, "{}.json".format(self.model_hash))

        if os.path.isfile(model_settings_json):
            return json.load(open(model_settings_json))
        else:
            for hash, settings in hash_mapper.items():
                if self.model_hash in hash:
                    return settings
            else:
                return self.get_model_data_from_popup()

    def get_model_data_from_popup(self):
        return None

    def get_model_hash(self):
        self.model_hash = None

        if not os.path.isfile(self.model_path):
            self.model_status = False
            self.model_hash = None
        else:
            if model_hash_table:
                for (key, value) in model_hash_table.items():
                    if self.model_path == key:
                        self.model_hash = value
                        break

            if not self.model_hash:
                try:
                    with open(self.model_path, 'rb') as f:
                        f.seek(-10000 * 1024, 2)
                        self.model_hash = hashlib.md5(f.read()).hexdigest()
                except:
                    self.model_hash = hashlib.md5(open(self.model_path, 'rb').read()).hexdigest()

                table_entry = {self.model_path: self.model_hash}
                model_hash_table.update(table_entry)

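`get_model_hash` fingerprints a model by MD5-hashing only the last 10000 * 1024 bytes of the file (`f.seek(-10000 * 1024, 2)` seeks relative to the end), falling back to hashing the whole file when it is smaller than that, and memoizes the result in `model_hash_table`. The same technique in isolation (`partial_md5` is a name introduced here, not part of the codebase):

    import hashlib

    def partial_md5(path: str, tail_bytes: int = 10000 * 1024) -> str:
        """Hash only the last tail_bytes of a file; hash it all if shorter."""
        with open(path, 'rb') as f:
            try:
                f.seek(-tail_bytes, 2)  # whence=2: offset relative to end of file
            except OSError:             # raised when the file is shorter than tail_bytes
                f.seek(0)
            return hashlib.md5(f.read()).hexdigest()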

class Ensembler():
    def __init__(self, is_manual_ensemble=False):
        self.is_save_all_outputs_ensemble = root.is_save_all_outputs_ensemble_var.get()
        chosen_ensemble_name = '{}'.format(root.chosen_ensemble_var.get().replace(" ", "_")) if not root.chosen_ensemble_var.get() == CHOOSE_ENSEMBLE_OPTION else 'Ensembled'
        ensemble_algorithm = root.ensemble_type_var.get().partition("/")
        ensemble_main_stem_pair = root.ensemble_main_stem_var.get().partition("/")
        time_stamp = round(time.time())
        self.audio_tool = MANUAL_ENSEMBLE
        self.main_export_path = Path(root.export_path_var.get())
        self.chosen_ensemble = f"_{chosen_ensemble_name}" if root.is_append_ensemble_name_var.get() else ''
        ensemble_folder_name = self.main_export_path if self.is_save_all_outputs_ensemble else ENSEMBLE_TEMP_PATH
        self.ensemble_folder_name = os.path.join(ensemble_folder_name, '{}_Outputs_{}'.format(chosen_ensemble_name, time_stamp))
        self.is_testing_audio = f"{time_stamp}_" if root.is_testing_audio_var.get() else ''
        self.primary_algorithm = ensemble_algorithm[0]
        self.secondary_algorithm = ensemble_algorithm[2]
        self.ensemble_primary_stem = ensemble_main_stem_pair[0]
        self.ensemble_secondary_stem = ensemble_main_stem_pair[2]
        self.is_normalization = root.is_normalization_var.get()
        self.wav_type_set = root.wav_type_set
        self.mp3_bit_set = root.mp3_bit_set_var.get()
        self.save_format = root.save_format_var.get()
        if not is_manual_ensemble:
            os.mkdir(self.ensemble_folder_name)

    def ensemble_outputs(self, audio_file_base, export_path, stem, is_4_stem=False, is_inst_mix=False):
        """Processes the given outputs and ensembles them with the chosen algorithm"""

        if is_4_stem:
            algorithm = root.ensemble_type_var.get()
            stem_tag = stem
        else:
            if is_inst_mix:
                algorithm = self.secondary_algorithm
                stem_tag = f"{self.ensemble_secondary_stem} {INST_STEM}"
            else:
                algorithm = self.primary_algorithm if stem == PRIMARY_STEM else self.secondary_algorithm
                stem_tag = self.ensemble_primary_stem if stem == PRIMARY_STEM else self.ensemble_secondary_stem

        stem_outputs = self.get_files_to_ensemble(folder=export_path, prefix=audio_file_base, suffix=f"_({stem_tag}).wav")
        audio_file_output = f"{self.is_testing_audio}{audio_file_base}{self.chosen_ensemble}_({stem_tag})"
        stem_save_path = os.path.join('{}'.format(self.main_export_path), '{}.wav'.format(audio_file_output))

        if stem_outputs:
            spec_utils.ensemble_inputs(stem_outputs, algorithm, self.is_normalization, self.wav_type_set, stem_save_path)
            save_format(stem_save_path, self.save_format, self.mp3_bit_set)

        if self.is_save_all_outputs_ensemble:
            for i in stem_outputs:
                save_format(i, self.save_format, self.mp3_bit_set)
        else:
            for i in stem_outputs:
                try:
                    os.remove(i)
                except Exception as e:
                    print(e)

    def ensemble_manual(self, audio_inputs, audio_file_base, is_bulk=False):
        """Processes the given outputs and ensembles them with the chosen algorithm"""

        is_mv_sep = True

        if is_bulk:
            number_list = list(set([os.path.basename(i).split("_")[0] for i in audio_inputs]))
            for n in number_list:
                current_list = [i for i in audio_inputs if os.path.basename(i).startswith(n)]
                audio_file_base = os.path.basename(current_list[0]).split('.wav')[0]
                stem_testing = "instrum" if "Instrumental" in audio_file_base else "vocals"
                if is_mv_sep:
                    audio_file_base = audio_file_base.split("_")
                    audio_file_base = f"{audio_file_base[1]}_{audio_file_base[2]}_{stem_testing}"
                self.ensemble_manual_process(current_list, audio_file_base, is_bulk)
        else:
            self.ensemble_manual_process(audio_inputs, audio_file_base, is_bulk)

    def ensemble_manual_process(self, audio_inputs, audio_file_base, is_bulk):

        algorithm = root.choose_algorithm_var.get()
        algorithm_text = "" if is_bulk else f"_({root.choose_algorithm_var.get()})"
        stem_save_path = os.path.join('{}'.format(self.main_export_path), '{}{}{}.wav'.format(self.is_testing_audio, audio_file_base, algorithm_text))
        spec_utils.ensemble_inputs(audio_inputs, algorithm, self.is_normalization, self.wav_type_set, stem_save_path)
        save_format(stem_save_path, self.save_format, self.mp3_bit_set)

    def get_files_to_ensemble(self, folder="", prefix="", suffix=""):
        """Grab all the files to be ensembled"""

        return [os.path.join(folder, i) for i in os.listdir(folder) if i.startswith(prefix) and i.endswith(suffix)]


def secondary_stem(stem):
    """Determines secondary stem"""

    for key, value in STEM_PAIR_MAPPER.items():
        if stem in key:
            secondary_stem = value

    return secondary_stem


class UVRInterface:
    def __init__(self) -> None:
        pass

    def assemble_model_data(self, model=None, arch_type=ENSEMBLE_MODE, is_dry_check=False) -> List[ModelData]:
        if arch_type == ENSEMBLE_STEM_CHECK:
            model_data = self.model_data_table
            missing_models = [model.model_status for model in model_data if not model.model_status]

            if missing_models or not model_data:
                model_data: List[ModelData] = [ModelData(model_name, is_dry_check=is_dry_check) for model_name in self.ensemble_model_list]
                self.model_data_table = model_data

        if arch_type == ENSEMBLE_MODE:
            model_data: List[ModelData] = [ModelData(model_name) for model_name in self.ensemble_listbox_get_all_selected_models()]
        if arch_type == ENSEMBLE_CHECK:
            model_data: List[ModelData] = [ModelData(model)]
        if arch_type == VR_ARCH_TYPE or arch_type == VR_ARCH_PM:
            model_data: List[ModelData] = [ModelData(model, VR_ARCH_TYPE)]
        if arch_type == MDX_ARCH_TYPE:
            model_data: List[ModelData] = [ModelData(model, MDX_ARCH_TYPE)]
        if arch_type == DEMUCS_ARCH_TYPE:
            model_data: List[ModelData] = [ModelData(model, DEMUCS_ARCH_TYPE)]

        return model_data

    def create_sample(self, audio_file, sample_path=SAMPLE_CLIP_PATH):
        try:
            with audioread.audio_open(audio_file) as f:
                track_length = int(f.duration)
        except Exception as e:
            print('Audioread failed to get duration. Trying Librosa...')
            y, sr = librosa.load(audio_file, mono=False, sr=44100)
            track_length = int(librosa.get_duration(y=y, sr=sr))

        clip_duration = int(root.model_sample_mode_duration_var.get())

        if track_length >= clip_duration:
            offset_cut = track_length//3
            off_cut = offset_cut + track_length
            if not off_cut >= clip_duration:
                offset_cut = 0
            name_apped = f'{clip_duration}_second_'
        else:
            offset_cut, clip_duration = 0, track_length
            name_apped = ''

        sample = librosa.load(audio_file, offset=offset_cut, duration=clip_duration, mono=False, sr=44100)[0].T
        audio_sample = os.path.join(sample_path, f'{os.path.splitext(os.path.basename(audio_file))[0]}_{name_apped}sample.wav')
        sf.write(audio_sample, sample, 44100)

        return audio_sample

    def verify_audio(self, audio_file, is_process=True, sample_path=None):
        is_good = False
        error_data = ''

        if os.path.isfile(audio_file):
            try:
                librosa.load(audio_file, duration=3, mono=False, sr=44100) if not type(sample_path) is str else self.create_sample(audio_file, sample_path)
                is_good = True
            except Exception as e:
                error_name = f'{type(e).__name__}'
                traceback_text = ''.join(traceback.format_tb(e.__traceback__))
                message = f'{error_name}: "{e}"\n{traceback_text}"'
                if is_process:
                    audio_base_name = os.path.basename(audio_file)
                    self.error_log_var.set(f'Error Loading the Following File:\n\n\"{audio_base_name}\"\n\nRaw Error Details:\n\n{message}')
                else:
                    error_data = AUDIO_VERIFICATION_CHECK(audio_file, message)

        if is_process:
            return is_good
        else:
            return is_good, error_data

    def cached_sources_clear(self):
        self.vr_cache_source_mapper = {}
        self.mdx_cache_source_mapper = {}
        self.demucs_cache_source_mapper = {}

    def cached_model_source_holder(self, process_method, sources, model_name=None):
        if process_method == VR_ARCH_TYPE:
            self.vr_cache_source_mapper = {**self.vr_cache_source_mapper, **{model_name: sources}}
        if process_method == MDX_ARCH_TYPE:
            self.mdx_cache_source_mapper = {**self.mdx_cache_source_mapper, **{model_name: sources}}
        if process_method == DEMUCS_ARCH_TYPE:
            self.demucs_cache_source_mapper = {**self.demucs_cache_source_mapper, **{model_name: sources}}

    def cached_source_callback(self, process_method, model_name=None):
        model, sources = None, None

        if process_method == VR_ARCH_TYPE:
            mapper = self.vr_cache_source_mapper
        if process_method == MDX_ARCH_TYPE:
            mapper = self.mdx_cache_source_mapper
        if process_method == DEMUCS_ARCH_TYPE:
            mapper = self.demucs_cache_source_mapper

        for key, value in mapper.items():
            if model_name in key:
                model = key
                sources = value

        return model, sources

    def cached_source_model_list_check(self, model_list: List[ModelData]):
        model: ModelData
        primary_model_names = lambda process_method: [model.model_basename if model.process_method == process_method else None for model in model_list]
        secondary_model_names = lambda process_method: [model.secondary_model.model_basename if model.is_secondary_model_activated and model.process_method == process_method else None for model in model_list]

        self.vr_primary_model_names = primary_model_names(VR_ARCH_TYPE)
        self.mdx_primary_model_names = primary_model_names(MDX_ARCH_TYPE)
        self.demucs_primary_model_names = primary_model_names(DEMUCS_ARCH_TYPE)
        self.vr_secondary_model_names = secondary_model_names(VR_ARCH_TYPE)
        self.mdx_secondary_model_names = secondary_model_names(MDX_ARCH_TYPE)
        self.demucs_secondary_model_names = [model.secondary_model.model_basename if model.is_secondary_model_activated and model.process_method == DEMUCS_ARCH_TYPE and not model.secondary_model is None else None for model in model_list]
        self.demucs_pre_proc_model_name = [model.pre_proc_model.model_basename if model.pre_proc_model else None for model in model_list]

        for model in model_list:
            if model.process_method == DEMUCS_ARCH_TYPE and model.is_demucs_4_stem_secondaries:
                if not model.is_4_stem_ensemble:
                    self.demucs_secondary_model_names = model.secondary_model_4_stem_model_names_list
                    break
                else:
                    for i in model.secondary_model_4_stem_model_names_list:
                        self.demucs_secondary_model_names.append(i)

        self.all_models = self.vr_primary_model_names + self.mdx_primary_model_names + self.demucs_primary_model_names + self.vr_secondary_model_names + self.mdx_secondary_model_names + self.demucs_secondary_model_names + self.demucs_pre_proc_model_name

    def process(self, model_name, arch_type, audio_file, export_path, is_model_sample_mode=False, is_4_stem_ensemble=False, set_progress_func=None, console_write=print) -> SeperateAttributes:
        stime = time.perf_counter()
        time_elapsed = lambda: f'Time Elapsed: {time.strftime("%H:%M:%S", time.gmtime(int(time.perf_counter() - stime)))}'

        if arch_type == ENSEMBLE_MODE:
            model_list, ensemble = self.assemble_model_data(), Ensembler()
            export_path = ensemble.ensemble_folder_name
            is_ensemble = True
        else:
            model_list = self.assemble_model_data(model_name, arch_type)
            is_ensemble = False
        self.cached_source_model_list_check(model_list)
        model = model_list[0]

        if self.verify_audio(audio_file):
            audio_file = self.create_sample(audio_file) if is_model_sample_mode else audio_file
        else:
            print(f'"{os.path.basename(audio_file)}" is missing or corrupted.\n')
            exit()

        audio_file_base = f"{os.path.splitext(os.path.basename(audio_file))[0]}"
        audio_file_base = audio_file_base if is_ensemble else f"{round(time.time())}_{audio_file_base}"
        audio_file_base = audio_file_base if not is_ensemble else f"{audio_file_base}_{model.model_basename}"
        if not is_ensemble:
            audio_file_base = f"{audio_file_base}_{model.model_basename}"

        if not is_ensemble:
            export_path = os.path.join(Path(export_path), model.model_basename, os.path.splitext(os.path.basename(audio_file))[0])
            if not os.path.isdir(export_path):
                os.makedirs(export_path)

        if set_progress_func is None:
            pbar = tqdm(total=1)
            self._progress = 0
            def set_progress_func(step, inference_iterations=0):
                progress_curr = step + inference_iterations
                pbar.update(progress_curr - self._progress)
                self._progress = progress_curr

            def postprocess():
                pbar.close()
        else:
            def postprocess():
                pass

        process_data = {
            'model_data': model,
            'export_path': export_path,
            'audio_file_base': audio_file_base,
            'audio_file': audio_file,
            'set_progress_bar': set_progress_func,
            'write_to_console': lambda progress_text, base_text='': console_write(base_text + progress_text),
            'process_iteration': lambda: None,
            'cached_source_callback': self.cached_source_callback,
            'cached_model_source_holder': self.cached_model_source_holder,
            'list_all_models': self.all_models,
            'is_ensemble_master': is_ensemble,
            'is_4_stem_ensemble': is_ensemble and is_4_stem_ensemble
        }
        if model.process_method == VR_ARCH_TYPE:
            seperator = SeperateVR(model, process_data)
        if model.process_method == MDX_ARCH_TYPE:
            seperator = SeperateMDX(model, process_data)
        if model.process_method == DEMUCS_ARCH_TYPE:
            seperator = SeperateDemucs(model, process_data)

        seperator.seperate()
        postprocess()

        if is_ensemble:
            audio_file_base = audio_file_base.replace(f"_{model.model_basename}", "")
            console_write(ENSEMBLING_OUTPUTS)

            if is_4_stem_ensemble:
                for output_stem in DEMUCS_4_SOURCE_LIST:
                    ensemble.ensemble_outputs(audio_file_base, export_path, output_stem, is_4_stem=True)
            else:
                if not root.is_secondary_stem_only_var.get():
                    ensemble.ensemble_outputs(audio_file_base, export_path, PRIMARY_STEM)
                if not root.is_primary_stem_only_var.get():
                    ensemble.ensemble_outputs(audio_file_base, export_path, SECONDARY_STEM)
                    ensemble.ensemble_outputs(audio_file_base, export_path, SECONDARY_STEM, is_inst_mix=True)

            console_write(DONE)

        if is_model_sample_mode:
            if os.path.isfile(audio_file):
                os.remove(audio_file)

        torch.cuda.empty_cache()

        if is_ensemble and len(os.listdir(export_path)) == 0:
            shutil.rmtree(export_path)
        console_write(f'Process Complete, using time: {time_elapsed()}\nOutput path: {export_path}')
        self.cached_sources_clear()
        return seperator


class RootWrapper:
    def __init__(self, var) -> None:
        self.var = var

    def set(self, val):
        self.var = val

    def get(self):
        return self.var

class FakeRoot:
    def __init__(self) -> None:
        self.wav_type_set = 'PCM_16'
        self.vr_hash_MAPPER = load_model_hash_data(VR_HASH_JSON)
        self.mdx_hash_MAPPER = load_model_hash_data(MDX_HASH_JSON)
        self.mdx_name_select_MAPPER = load_model_hash_data(MDX_MODEL_NAME_SELECT)
        self.demucs_name_select_MAPPER = load_model_hash_data(DEMUCS_MODEL_NAME_SELECT)

    def __getattribute__(self, __name: str):
        try:
            return super().__getattribute__(__name)
        except AttributeError:
            # Any attribute that was never set is created on the fly as a
            # RootWrapper, mimicking a Tkinter variable with get()/set().
            wrapped = RootWrapper(None)
            super().__setattr__(__name, wrapped)
            return wrapped

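`FakeRoot` is the headless stand-in for the desktop app's Tkinter root window: `__getattribute__` auto-creates any attribute that was never assigned, so code written against Tkinter `Variable.get()`/`.set()` calls (such as `ModelData` above) runs without a GUI. A small demonstration, assuming the model_data JSON files are in place so `FakeRoot()` can construct:

    fake = FakeRoot()
    fake.window_size_var.set(512)      # attribute springs into existence on first access
    print(fake.window_size_var.get())  # -> 512
    print(fake.never_set_var.get())    # -> None (fresh RootWrapper default)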
    def load_saved_settings(self, loaded_setting: dict, process_method=None):
        """Loads user saved application settings or resets to default"""

        for key, value in DEFAULT_DATA.items():
            if not key in loaded_setting.keys():
                loaded_setting = {**loaded_setting, **{key: value}}
                loaded_setting['batch_size'] = DEF_OPT

        is_ensemble = True if process_method == ENSEMBLE_MODE else False

        if not process_method or process_method == VR_ARCH_PM or is_ensemble:
            self.vr_model_var.set(loaded_setting['vr_model'])
            self.aggression_setting_var.set(loaded_setting['aggression_setting'])
            self.window_size_var.set(loaded_setting['window_size'])
            self.batch_size_var.set(loaded_setting['batch_size'])
            self.crop_size_var.set(loaded_setting['crop_size'])
            self.is_tta_var.set(loaded_setting['is_tta'])
            self.is_output_image_var.set(loaded_setting['is_output_image'])
            self.is_post_process_var.set(loaded_setting['is_post_process'])
            self.is_high_end_process_var.set(loaded_setting['is_high_end_process'])
            self.post_process_threshold_var.set(loaded_setting['post_process_threshold'])
            self.vr_voc_inst_secondary_model_var.set(loaded_setting['vr_voc_inst_secondary_model'])
            self.vr_other_secondary_model_var.set(loaded_setting['vr_other_secondary_model'])
            self.vr_bass_secondary_model_var.set(loaded_setting['vr_bass_secondary_model'])
            self.vr_drums_secondary_model_var.set(loaded_setting['vr_drums_secondary_model'])
            self.vr_is_secondary_model_activate_var.set(loaded_setting['vr_is_secondary_model_activate'])
            self.vr_voc_inst_secondary_model_scale_var.set(loaded_setting['vr_voc_inst_secondary_model_scale'])
            self.vr_other_secondary_model_scale_var.set(loaded_setting['vr_other_secondary_model_scale'])
            self.vr_bass_secondary_model_scale_var.set(loaded_setting['vr_bass_secondary_model_scale'])
            self.vr_drums_secondary_model_scale_var.set(loaded_setting['vr_drums_secondary_model_scale'])

        if not process_method or process_method == DEMUCS_ARCH_TYPE or is_ensemble:
            self.demucs_model_var.set(loaded_setting['demucs_model'])
            self.segment_var.set(loaded_setting['segment'])
            self.overlap_var.set(loaded_setting['overlap'])
            self.shifts_var.set(loaded_setting['shifts'])
            self.chunks_demucs_var.set(loaded_setting['chunks_demucs'])
            self.margin_demucs_var.set(loaded_setting['margin_demucs'])
            self.is_chunk_demucs_var.set(loaded_setting['is_chunk_demucs'])
            self.is_chunk_mdxnet_var.set(loaded_setting['is_chunk_mdxnet'])
            self.is_primary_stem_only_Demucs_var.set(loaded_setting['is_primary_stem_only_Demucs'])
            self.is_secondary_stem_only_Demucs_var.set(loaded_setting['is_secondary_stem_only_Demucs'])
            self.is_split_mode_var.set(loaded_setting['is_split_mode'])
            self.is_demucs_combine_stems_var.set(loaded_setting['is_demucs_combine_stems'])
            self.demucs_voc_inst_secondary_model_var.set(loaded_setting['demucs_voc_inst_secondary_model'])
            self.demucs_other_secondary_model_var.set(loaded_setting['demucs_other_secondary_model'])
            self.demucs_bass_secondary_model_var.set(loaded_setting['demucs_bass_secondary_model'])
            self.demucs_drums_secondary_model_var.set(loaded_setting['demucs_drums_secondary_model'])
            self.demucs_is_secondary_model_activate_var.set(loaded_setting['demucs_is_secondary_model_activate'])
            self.demucs_voc_inst_secondary_model_scale_var.set(loaded_setting['demucs_voc_inst_secondary_model_scale'])
            self.demucs_other_secondary_model_scale_var.set(loaded_setting['demucs_other_secondary_model_scale'])
            self.demucs_bass_secondary_model_scale_var.set(loaded_setting['demucs_bass_secondary_model_scale'])
            self.demucs_drums_secondary_model_scale_var.set(loaded_setting['demucs_drums_secondary_model_scale'])
            self.demucs_stems_var.set(loaded_setting['demucs_stems'])
            # self.update_stem_checkbox_labels(self.demucs_stems_var.get(), demucs=True)
            # Note: these three read the module-level `data` dict loaded from
            # data.pkl rather than the loaded_setting argument.
            self.demucs_pre_proc_model_var.set(data['demucs_pre_proc_model'])
            self.is_demucs_pre_proc_model_activate_var.set(data['is_demucs_pre_proc_model_activate'])
            self.is_demucs_pre_proc_model_inst_mix_var.set(data['is_demucs_pre_proc_model_inst_mix'])

        if not process_method or process_method == MDX_ARCH_TYPE or is_ensemble:
            self.mdx_net_model_var.set(loaded_setting['mdx_net_model'])
            self.chunks_var.set(loaded_setting['chunks'])
            self.margin_var.set(loaded_setting['margin'])
            self.compensate_var.set(loaded_setting['compensate'])
            self.is_denoise_var.set(loaded_setting['is_denoise'])
            self.is_invert_spec_var.set(loaded_setting['is_invert_spec'])
            self.is_mixer_mode_var.set(loaded_setting['is_mixer_mode'])
            self.mdx_batch_size_var.set(loaded_setting['mdx_batch_size'])
            self.mdx_voc_inst_secondary_model_var.set(loaded_setting['mdx_voc_inst_secondary_model'])
            self.mdx_other_secondary_model_var.set(loaded_setting['mdx_other_secondary_model'])
            self.mdx_bass_secondary_model_var.set(loaded_setting['mdx_bass_secondary_model'])
            self.mdx_drums_secondary_model_var.set(loaded_setting['mdx_drums_secondary_model'])
            self.mdx_is_secondary_model_activate_var.set(loaded_setting['mdx_is_secondary_model_activate'])
            self.mdx_voc_inst_secondary_model_scale_var.set(loaded_setting['mdx_voc_inst_secondary_model_scale'])
            self.mdx_other_secondary_model_scale_var.set(loaded_setting['mdx_other_secondary_model_scale'])
            self.mdx_bass_secondary_model_scale_var.set(loaded_setting['mdx_bass_secondary_model_scale'])
            self.mdx_drums_secondary_model_scale_var.set(loaded_setting['mdx_drums_secondary_model_scale'])

        if not process_method or is_ensemble:
            self.is_save_all_outputs_ensemble_var.set(loaded_setting['is_save_all_outputs_ensemble'])
            self.is_append_ensemble_name_var.set(loaded_setting['is_append_ensemble_name'])
            self.chosen_audio_tool_var.set(loaded_setting['chosen_audio_tool'])
            self.choose_algorithm_var.set(loaded_setting['choose_algorithm'])
            self.time_stretch_rate_var.set(loaded_setting['time_stretch_rate'])
            self.pitch_rate_var.set(loaded_setting['pitch_rate'])
            self.is_primary_stem_only_var.set(loaded_setting['is_primary_stem_only'])
            self.is_secondary_stem_only_var.set(loaded_setting['is_secondary_stem_only'])
            self.is_testing_audio_var.set(loaded_setting['is_testing_audio'])
            self.is_add_model_name_var.set(loaded_setting['is_add_model_name'])
            self.is_accept_any_input_var.set(loaded_setting["is_accept_any_input"])
            self.is_task_complete_var.set(loaded_setting['is_task_complete'])
            self.is_create_model_folder_var.set(loaded_setting['is_create_model_folder'])
            self.mp3_bit_set_var.set(loaded_setting['mp3_bit_set'])
            self.save_format_var.set(loaded_setting['save_format'])
            self.wav_type_set_var.set(loaded_setting['wav_type_set'])
            self.user_code_var.set(loaded_setting['user_code'])

        self.is_gpu_conversion_var.set(loaded_setting['is_gpu_conversion'])
        self.is_normalization_var.set(loaded_setting['is_normalization'])
        self.help_hints_var.set(loaded_setting['help_hints_var'])

        self.model_sample_mode_var.set(loaded_setting['model_sample_mode'])
        self.model_sample_mode_duration_var.set(loaded_setting['model_sample_mode_duration'])


root = FakeRoot()
root.load_saved_settings(DEFAULT_DATA)
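A plausible way to drive this module directly (webUI.py in this commit presumably does the equivalent; the model name and paths below are placeholders, not part of the repo):

    uvr = UVRInterface()
    uvr.process(
        model_name='UVR-MDX-NET Main',  # placeholder: any model present in models/MDX_Net_Models
        arch_type=MDX_ARCH_TYPE,
        audio_file='input/song.wav',
        export_path='output',
    )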
__version__.py ADDED
@@ -0,0 +1,4 @@
VERSION = 'v5.5.1'
PATCH = 'UVR_Patch_3_31_23_5_5'
PATCH_MAC = 'UVR_Patch_01_10_12_6_50'
PATCH_LINUX = 'UVR_Patch_01_01_23_6_50'
app.py ADDED
@@ -0,0 +1,3 @@
import os

os.system("python webUI.py")
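`os.system` here blocks until webUI.py exits and discards everything except its exit status. An equivalent launch with `subprocess`, pinning the same interpreter (a sketch, not what the Space ships):

    import subprocess
    import sys

    subprocess.run([sys.executable, "webUI.py"], check=True)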
packages.txt ADDED
@@ -0,0 +1,3 @@
git-lfs
aria2 -y
ffmpeg
requirements.txt ADDED
@@ -0,0 +1,43 @@
altgraph==0.17.3
audioread==3.0.0
certifi==2022.12.7
cffi==1.15.1
cryptography==3.4.6
diffq==0.2.3
Dora==0.0.3
einops==0.6.0
future==0.18.2
julius==0.2.7
kthread==0.2.3
librosa==0.9.2
llvmlite==0.39.1
natsort==8.2.0
numba==0.56.4
numpy==1.23.4
omegaconf==2.2.3
opencv-python==4.6.0.66
onnx
onnxruntime==1.13.1
Pillow==9.3.0
playsound==1.3.0
psutil==5.9.4
pydub==0.25.1
pyglet==1.5.23
pyperclip==1.8.2
pyrubberband==0.3.0
pytorch_lightning==2.0.0
PyYAML==6.0
resampy==0.2.2
scipy==1.9.3
soundfile==0.11.0
soundstretch==1.2
torch==1.13.1
tqdm
urllib3==1.26.12
wget==3.2
samplerate==0.1.0
screeninfo==0.8.1
PySoundFile==0.9.0.post1; sys_platform != 'windows'
SoundFile==0.9.0; sys_platform == 'windows'

gradio >= 3.19
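A note on the two environment markers: PEP 508 `sys_platform` compares against Python's `sys.platform`, whose value on Windows is 'win32' (not 'windows'). As written, the `PySoundFile` line therefore matches on every platform and the `SoundFile==0.9.0` line never matches; `soundfile==0.11.0` above is what actually gets installed. The value can be checked with:

    import sys
    print(sys.platform)  # 'win32' on Windows, 'linux' on Linux, 'darwin' on macOS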
separate.py ADDED
@@ -0,0 +1,942 @@
from __future__ import annotations
from typing import TYPE_CHECKING
from demucs.apply import apply_model, demucs_segments
from demucs.hdemucs import HDemucs
from demucs.model_v2 import auto_load_demucs_model_v2
from demucs.pretrained import get_model as _gm
from demucs.utils import apply_model_v1
from demucs.utils import apply_model_v2
from lib_v5 import spec_utils
from lib_v5.vr_network import nets
from lib_v5.vr_network import nets_new
#from lib_v5.vr_network.model_param_init import ModelParameters
from pathlib import Path
from gui_data.constants import *
from gui_data.error_handling import *
import audioread
import gzip
import librosa
import math
import numpy as np
import onnxruntime as ort
import os
import torch
import warnings
import pydub
import soundfile as sf
import traceback
import lib_v5.mdxnet as MdxnetSet

if TYPE_CHECKING:
    from UVR import ModelData

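The `TYPE_CHECKING` guard above imports `ModelData` for static type checkers only, which avoids a runtime circular import between separate.py and UVR.py; combined with `from __future__ import annotations`, the `model_data: ModelData` annotation in `SeperateAttributes` below is never evaluated at runtime. The pattern in isolation (module and class names here are illustrative):

    from __future__ import annotations
    from typing import TYPE_CHECKING

    if TYPE_CHECKING:               # True only under mypy/pyright, never at runtime
        from heavy_module import HeavyClass

    def use(obj: HeavyClass) -> None:   # annotation stays an unevaluated string at runtime
        obj.run()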
33 |
+
warnings.filterwarnings("ignore")
|
34 |
+
cpu = torch.device('cpu')
|
35 |
+
|
class SeperateAttributes:
    def __init__(self, model_data: ModelData, process_data: dict, main_model_primary_stem_4_stem=None, main_process_method=None):

        self.list_all_models: list
        self.process_data = process_data
        self.progress_value = 0
        self.set_progress_bar = process_data['set_progress_bar']
        self.write_to_console = process_data['write_to_console']
        self.audio_file = process_data['audio_file']
        self.audio_file_base = process_data['audio_file_base']
        self.export_path = process_data['export_path']
        self.cached_source_callback = process_data['cached_source_callback']
        self.cached_model_source_holder = process_data['cached_model_source_holder']
        self.is_4_stem_ensemble = process_data['is_4_stem_ensemble']
        self.list_all_models = process_data['list_all_models']
        self.process_iteration = process_data['process_iteration']
        self.mixer_path = model_data.mixer_path
        self.model_samplerate = model_data.model_samplerate
        self.model_capacity = model_data.model_capacity
        self.is_vr_51_model = model_data.is_vr_51_model
        self.is_pre_proc_model = model_data.is_pre_proc_model
        self.is_secondary_model_activated = model_data.is_secondary_model_activated if not self.is_pre_proc_model else False
        self.is_secondary_model = model_data.is_secondary_model if not self.is_pre_proc_model else True
        self.process_method = model_data.process_method
        self.model_path = model_data.model_path
        self.model_name = model_data.model_name
        self.model_basename = model_data.model_basename
        self.wav_type_set = model_data.wav_type_set
        self.mp3_bit_set = model_data.mp3_bit_set
        self.save_format = model_data.save_format
        self.is_gpu_conversion = model_data.is_gpu_conversion
        self.is_normalization = model_data.is_normalization
        self.is_primary_stem_only = model_data.is_primary_stem_only if not self.is_secondary_model else model_data.is_primary_model_primary_stem_only
        self.is_secondary_stem_only = model_data.is_secondary_stem_only if not self.is_secondary_model else model_data.is_primary_model_secondary_stem_only
        self.is_ensemble_mode = model_data.is_ensemble_mode
        self.secondary_model = model_data.secondary_model
        self.primary_model_primary_stem = model_data.primary_model_primary_stem
        self.primary_stem = model_data.primary_stem
        self.secondary_stem = model_data.secondary_stem
        self.is_invert_spec = model_data.is_invert_spec
        self.is_mixer_mode = model_data.is_mixer_mode
        self.secondary_model_scale = model_data.secondary_model_scale
        self.is_demucs_pre_proc_model_inst_mix = model_data.is_demucs_pre_proc_model_inst_mix
        self.primary_source_map = {}
        self.secondary_source_map = {}
        self.primary_source = None
        self.secondary_source = None
        self.secondary_source_primary = None
        self.secondary_source_secondary = None

        if not model_data.process_method == DEMUCS_ARCH_TYPE:
            if process_data['is_ensemble_master'] and not self.is_4_stem_ensemble:
                if not model_data.ensemble_primary_stem == self.primary_stem:
                    self.is_primary_stem_only, self.is_secondary_stem_only = self.is_secondary_stem_only, self.is_primary_stem_only

            if self.is_secondary_model and not process_data['is_ensemble_master']:
                if not self.primary_model_primary_stem == self.primary_stem and not main_model_primary_stem_4_stem:
                    self.is_primary_stem_only, self.is_secondary_stem_only = self.is_secondary_stem_only, self.is_primary_stem_only

            if main_model_primary_stem_4_stem:
                self.is_primary_stem_only = True if main_model_primary_stem_4_stem == self.primary_stem else False
                self.is_secondary_stem_only = True if not main_model_primary_stem_4_stem == self.primary_stem else False

            if self.is_pre_proc_model:
                self.is_primary_stem_only = True if self.primary_stem == INST_STEM else False
                self.is_secondary_stem_only = True if self.secondary_stem == INST_STEM else False

        if model_data.process_method == MDX_ARCH_TYPE:
            self.is_mdx_ckpt = model_data.is_mdx_ckpt
            self.primary_model_name, self.primary_sources = self.cached_source_callback(MDX_ARCH_TYPE, model_name=self.model_basename)
            self.is_denoise = model_data.is_denoise
            self.mdx_batch_size = model_data.mdx_batch_size
            self.compensate = model_data.compensate
            self.dim_f, self.dim_t = model_data.mdx_dim_f_set, 2**model_data.mdx_dim_t_set
            self.n_fft = model_data.mdx_n_fft_scale_set
            self.chunks = model_data.chunks
            self.margin = model_data.margin
            self.adjust = 1
            self.dim_c = 4
            self.hop = 1024

            if self.is_gpu_conversion >= 0 and torch.cuda.is_available():
                self.device, self.run_type = torch.device('cuda:0'), ['CUDAExecutionProvider']
            else:
                self.device, self.run_type = torch.device('cpu'), ['CPUExecutionProvider']

        if model_data.process_method == DEMUCS_ARCH_TYPE:
            self.demucs_stems = model_data.demucs_stems if not main_process_method in [MDX_ARCH_TYPE, VR_ARCH_TYPE] else None
            self.secondary_model_4_stem = model_data.secondary_model_4_stem
            self.secondary_model_4_stem_scale = model_data.secondary_model_4_stem_scale
            self.primary_stem = model_data.ensemble_primary_stem if process_data['is_ensemble_master'] else model_data.primary_stem
            self.secondary_stem = model_data.ensemble_secondary_stem if process_data['is_ensemble_master'] else model_data.secondary_stem
            self.is_chunk_demucs = model_data.is_chunk_demucs
            self.segment = model_data.segment
            self.demucs_version = model_data.demucs_version
            self.demucs_source_list = model_data.demucs_source_list
            self.demucs_source_map = model_data.demucs_source_map
            self.is_demucs_combine_stems = model_data.is_demucs_combine_stems
            self.demucs_stem_count = model_data.demucs_stem_count
            self.pre_proc_model = model_data.pre_proc_model

            if self.is_secondary_model and not process_data['is_ensemble_master']:
                if not self.demucs_stem_count == 2 and model_data.primary_model_primary_stem == INST_STEM:
                    self.primary_stem = VOCAL_STEM
                    self.secondary_stem = INST_STEM
                else:
                    self.primary_stem = model_data.primary_model_primary_stem
                    self.secondary_stem = STEM_PAIR_MAPPER[self.primary_stem]

            if self.is_chunk_demucs:
                self.chunks_demucs = model_data.chunks_demucs
                self.margin_demucs = model_data.margin_demucs
            else:
                self.chunks_demucs = 0
                self.margin_demucs = 44100

            self.shifts = model_data.shifts
            self.is_split_mode = model_data.is_split_mode if not self.demucs_version == DEMUCS_V4 else True
            self.overlap = model_data.overlap
            self.primary_model_name, self.primary_sources = self.cached_source_callback(DEMUCS_ARCH_TYPE, model_name=self.model_basename)

        if model_data.process_method == VR_ARCH_TYPE:
            self.primary_model_name, self.primary_sources = self.cached_source_callback(VR_ARCH_TYPE, model_name=self.model_basename)
            self.mp = model_data.vr_model_param
            self.high_end_process = model_data.is_high_end_process
            self.is_tta = model_data.is_tta
            self.is_post_process = model_data.is_post_process
            self.is_gpu_conversion = model_data.is_gpu_conversion
            self.batch_size = model_data.batch_size
            self.window_size = model_data.window_size
            self.input_high_end_h = None
            self.post_process_threshold = model_data.post_process_threshold
            self.aggressiveness = {'value': model_data.aggression_setting,
                                   'split_bin': self.mp.param['band'][1]['crop_stop'],
                                   'aggr_correction': self.mp.param.get('aggr_correction')}

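    # Console/progress helpers: pick the status message for the current pass
    # (secondary model, pre-process model, or primary model) and advance the
    # progress bar as inference batches complete.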
    def start_inference_console_write(self):

        if self.is_secondary_model and not self.is_pre_proc_model:
            self.write_to_console(INFERENCE_STEP_2_SEC(self.process_method, self.model_basename))

        if self.is_pre_proc_model:
            self.write_to_console(INFERENCE_STEP_2_PRE(self.process_method, self.model_basename))

    def running_inference_console_write(self, is_no_write=False):

        self.write_to_console(DONE, base_text='') if not is_no_write else None
        self.set_progress_bar(0.05) if not is_no_write else None

        if self.is_secondary_model and not self.is_pre_proc_model:
            self.write_to_console(INFERENCE_STEP_1_SEC)
        elif self.is_pre_proc_model:
            self.write_to_console(INFERENCE_STEP_1_PRE)
        else:
            self.write_to_console(INFERENCE_STEP_1)

    def running_inference_progress_bar(self, length, is_match_mix=False):
        if not is_match_mix:
            self.progress_value += 1

            if (0.8/length*self.progress_value) >= 0.8:
                length = self.progress_value + 1

            self.set_progress_bar(0.1, (0.8/length*self.progress_value))

    def load_cached_sources(self, is_4_stem_demucs=False):

        if self.is_secondary_model and not self.is_pre_proc_model:
            self.write_to_console(INFERENCE_STEP_2_SEC_CACHED_MODOEL(self.process_method, self.model_basename))
        elif self.is_pre_proc_model:
            self.write_to_console(INFERENCE_STEP_2_PRE_CACHED_MODOEL(self.process_method, self.model_basename))
        else:
            self.write_to_console(INFERENCE_STEP_2_PRIMARY_CACHED)

        if not is_4_stem_demucs:
            primary_stem, secondary_stem = gather_sources(self.primary_stem, self.secondary_stem, self.primary_sources)

            return primary_stem, secondary_stem

    def cache_source(self, secondary_sources):

        model_occurrences = self.list_all_models.count(self.model_basename)

        if not model_occurrences <= 1:
            if self.process_method == MDX_ARCH_TYPE:
                self.cached_model_source_holder(MDX_ARCH_TYPE, secondary_sources, self.model_basename)

            if self.process_method == VR_ARCH_TYPE:
                self.cached_model_source_holder(VR_ARCH_TYPE, secondary_sources, self.model_basename)

            if self.process_method == DEMUCS_ARCH_TYPE:
                self.cached_model_source_holder(DEMUCS_ARCH_TYPE, secondary_sources, self.model_basename)

    def write_audio(self, stem_path, stem_source, samplerate, secondary_model_source=None, model_scale=None):

        if not self.is_secondary_model:
            if self.is_secondary_model_activated:
                if isinstance(secondary_model_source, np.ndarray):
                    secondary_model_scale = model_scale if model_scale else self.secondary_model_scale
                    stem_source = spec_utils.average_dual_sources(stem_source, secondary_model_source, secondary_model_scale)

            sf.write(stem_path, stem_source, samplerate, subtype=self.wav_type_set)
            save_format(stem_path, self.save_format, self.mp3_bit_set) if not self.is_ensemble_mode else None

            self.write_to_console(DONE, base_text='')
            self.set_progress_bar(0.95)

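    # Optional 4-stem mixer pass: refines the separated sources with a small
    # MDX-Net mixer model when mixer mode is enabled; falls back to the raw
    # sources on any failure.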
    def run_mixer(self, mix, sources):
        try:
            if self.is_mixer_mode and len(sources) == 4:
                mixer = MdxnetSet.Mixer(self.device, self.mixer_path).eval()
                with torch.no_grad():
                    mix = torch.tensor(mix, dtype=torch.float32)
                    sources_ = torch.tensor(sources).detach()
                    x = torch.cat([sources_, mix.unsqueeze(0)], 0)
                    sources_ = mixer(x)
                    final_source = np.array(sources_)
            else:
                final_source = sources
        except Exception as e:
            error_name = f'{type(e).__name__}'
            traceback_text = ''.join(traceback.format_tb(e.__traceback__))
            message = f'{error_name}: "{e}"\n{traceback_text}'
            print('Mixer Failed: ', message)
            final_source = sources

        return final_source

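# MDX-Net separation: runs an ONNX (or .ckpt) spectrogram model over chunked
# audio to produce the primary stem, and derives the secondary stem by
# subtraction or spectral inversion against the raw mixture.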
class SeperateMDX(SeperateAttributes):

    def seperate(self):
        samplerate = 44100

        if self.primary_model_name == self.model_basename and self.primary_sources:
            self.primary_source, self.secondary_source = self.load_cached_sources()
        else:
            self.start_inference_console_write()

            if self.is_mdx_ckpt:
                model_params = torch.load(self.model_path, map_location=lambda storage, loc: storage)['hyper_parameters']
                self.dim_c, self.hop = model_params['dim_c'], model_params['hop_length']
                separator = MdxnetSet.ConvTDFNet(**model_params)
                self.model_run = separator.load_from_checkpoint(self.model_path).to(self.device).eval()
            else:
                ort_ = ort.InferenceSession(self.model_path, providers=self.run_type)
                self.model_run = lambda spek: ort_.run(None, {'input': spek.cpu().numpy()})[0]

            self.initialize_model_settings()
            self.running_inference_console_write()
            mdx_net_cut = True if self.primary_stem in MDX_NET_FREQ_CUT else False
            mix, raw_mix, samplerate = prepare_mix(self.audio_file, self.chunks, self.margin, mdx_net_cut=mdx_net_cut)
            source = self.demix_base(mix, is_ckpt=self.is_mdx_ckpt)[0]
            self.write_to_console(DONE, base_text='')

        if self.is_secondary_model_activated:
            if self.secondary_model:
                self.secondary_source_primary, self.secondary_source_secondary = process_secondary_model(self.secondary_model, self.process_data, main_process_method=self.process_method)

        if not self.is_secondary_stem_only:
            self.write_to_console(f'{SAVING_STEM[0]}{self.primary_stem}{SAVING_STEM[1]}') if not self.is_secondary_model else None
            primary_stem_path = os.path.join(self.export_path, f'{self.audio_file_base}_({self.primary_stem}).wav')
            if not isinstance(self.primary_source, np.ndarray):
                self.primary_source = spec_utils.normalize(source, self.is_normalization).T
            self.primary_source_map = {self.primary_stem: self.primary_source}
            self.write_audio(primary_stem_path, self.primary_source, samplerate, self.secondary_source_primary)

        if not self.is_primary_stem_only:
            self.write_to_console(f'{SAVING_STEM[0]}{self.secondary_stem}{SAVING_STEM[1]}') if not self.is_secondary_model else None
            secondary_stem_path = os.path.join(self.export_path, f'{self.audio_file_base}_({self.secondary_stem}).wav')
            if not isinstance(self.secondary_source, np.ndarray):
                raw_mix = self.demix_base(raw_mix, is_match_mix=True)[0] if mdx_net_cut else raw_mix
                self.secondary_source, raw_mix = spec_utils.normalize_two_stem(source*self.compensate, raw_mix, self.is_normalization)

                if self.is_invert_spec:
                    self.secondary_source = spec_utils.invert_stem(raw_mix, self.secondary_source)
                else:
                    self.secondary_source = (-self.secondary_source.T+raw_mix.T)

            self.secondary_source_map = {self.secondary_stem: self.secondary_source}
            self.write_audio(secondary_stem_path, self.secondary_source, samplerate, self.secondary_source_secondary)

        torch.cuda.empty_cache()
        secondary_sources = {**self.primary_source_map, **self.secondary_source_map}

        self.cache_source(secondary_sources)

        if self.is_secondary_model:
            return secondary_sources

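    # STFT framing parameters derived from the model's dim_f/dim_t/n_fft/hop
    # settings; each chunk carries a `trim` overlap so the edges can be
    # discarded after the inverse STFT.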
    def initialize_model_settings(self):
        self.n_bins = self.n_fft//2+1
        self.trim = self.n_fft//2
        self.chunk_size = self.hop * (self.dim_t-1)
        self.window = torch.hann_window(window_length=self.n_fft, periodic=False).to(self.device)
        self.freq_pad = torch.zeros([1, self.dim_c, self.n_bins-self.dim_f, self.dim_t]).to(self.device)
        self.gen_size = self.chunk_size-2*self.trim

    def initialize_mix(self, mix, is_ckpt=False):
        if is_ckpt:
            pad = self.gen_size + self.trim - ((mix.shape[-1]) % self.gen_size)
            mixture = np.concatenate((np.zeros((2, self.trim), dtype='float32'), mix, np.zeros((2, pad), dtype='float32')), 1)
            num_chunks = mixture.shape[-1] // self.gen_size
            mix_waves = [mixture[:, i * self.gen_size: i * self.gen_size + self.chunk_size] for i in range(num_chunks)]
        else:
            mix_waves = []
            n_sample = mix.shape[1]
            pad = self.gen_size - n_sample%self.gen_size
            mix_p = np.concatenate((np.zeros((2,self.trim)), mix, np.zeros((2,pad)), np.zeros((2,self.trim))), 1)
            i = 0
            while i < n_sample + pad:
                waves = np.array(mix_p[:, i:i+self.chunk_size])
                mix_waves.append(waves)
                i += self.gen_size

        mix_waves = torch.tensor(mix_waves, dtype=torch.float32).to(self.device)

        return mix_waves, pad

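    # Runs the model over each margin-padded chunk, stitches the per-chunk
    # waveforms back together, and trims the overlap margins.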
    def demix_base(self, mix, is_ckpt=False, is_match_mix=False):
        chunked_sources = []
        for slice in mix:
            sources = []
            tar_waves_ = []
            mix_p = mix[slice]
            mix_waves, pad = self.initialize_mix(mix_p, is_ckpt=is_ckpt)
            mix_waves = mix_waves.split(self.mdx_batch_size)
            pad = mix_p.shape[-1] if is_ckpt else -pad
            with torch.no_grad():
                for mix_wave in mix_waves:
                    self.running_inference_progress_bar(len(mix)*len(mix_waves), is_match_mix=is_match_mix)
                    tar_waves = self.run_model(mix_wave, is_ckpt=is_ckpt, is_match_mix=is_match_mix)
                    tar_waves_.append(tar_waves)
                tar_waves_ = np.vstack(tar_waves_)[:, :, self.trim:-self.trim] if is_ckpt else tar_waves_
                tar_waves = np.concatenate(tar_waves_, axis=-1)[:, :pad]
                start = 0 if slice == 0 else self.margin
                end = None if slice == list(mix.keys())[::-1][0] or self.margin == 0 else -self.margin
                sources.append(tar_waves[:,start:end]*(1/self.adjust))
            chunked_sources.append(sources)
        sources = np.concatenate(chunked_sources, axis=-1)

        return sources

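    # Single forward pass on one batch of frames. When denoising is enabled the
    # spectrogram is inferred twice (once negated) and the results are averaged;
    # is_match_mix skips the model entirely and returns the input spectrogram.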
    def run_model(self, mix, is_ckpt=False, is_match_mix=False):

        spek = self.stft(mix.to(self.device))*self.adjust
        spek[:, :, :3, :] *= 0

        if is_match_mix:
            spec_pred = spek.cpu().numpy()
        else:
            spec_pred = -self.model_run(-spek)*0.5+self.model_run(spek)*0.5 if self.is_denoise else self.model_run(spek)

        if is_ckpt:
            return self.istft(spec_pred).cpu().detach().numpy()
        else:
            return self.istft(torch.tensor(spec_pred).to(self.device)).to(cpu)[:,:,self.trim:-self.trim].transpose(0,1).reshape(2, -1).numpy()

    def stft(self, x):
        x = x.reshape([-1, self.chunk_size])
        x = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True, return_complex=True)
        x = torch.view_as_real(x)
        x = x.permute([0,3,1,2])
        x = x.reshape([-1,2,2,self.n_bins,self.dim_t]).reshape([-1,self.dim_c,self.n_bins,self.dim_t])
        return x[:,:,:self.dim_f]

    def istft(self, x, freq_pad=None):
        freq_pad = self.freq_pad.repeat([x.shape[0],1,1,1]) if freq_pad is None else freq_pad
        x = torch.cat([x, freq_pad], -2)
        x = x.reshape([-1,2,2,self.n_bins,self.dim_t]).reshape([-1,2,self.n_bins,self.dim_t])
        x = x.permute([0,2,3,1])
        x = x.contiguous()
        x = torch.view_as_complex(x)
        x = torch.istft(x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True)
        return x.reshape([-1,2,self.chunk_size])

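# Demucs separation: loads v1/v2/v4 model weights, optionally runs a
# pre-process model to strip vocals first, and writes either all stems
# (4/6-stem mode) or a primary/secondary pair.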
class SeperateDemucs(SeperateAttributes):

    def seperate(self):

        samplerate = 44100
        source = None
        model_scale = None
        stem_source = None
        stem_source_secondary = None
        inst_mix = None
        inst_raw_mix = None
        raw_mix = None
        inst_source = None
        is_no_write = False
        is_no_piano_guitar = False

        if self.primary_model_name == self.model_basename and type(self.primary_sources) is dict and not self.pre_proc_model:
            self.primary_source, self.secondary_source = self.load_cached_sources()
        elif self.primary_model_name == self.model_basename and isinstance(self.primary_sources, np.ndarray) and not self.pre_proc_model:
            source = self.primary_sources
            self.load_cached_sources(is_4_stem_demucs=True)
        else:
            self.start_inference_console_write()

            if self.is_gpu_conversion >= 0:
                self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
            else:
                self.device = torch.device('cpu')

            if self.demucs_version == DEMUCS_V1:
                if str(self.model_path).endswith(".gz"):
                    self.model_path = gzip.open(self.model_path, "rb")
                klass, args, kwargs, state = torch.load(self.model_path)
                self.demucs = klass(*args, **kwargs)
                self.demucs.to(self.device)
                self.demucs.load_state_dict(state)
            elif self.demucs_version == DEMUCS_V2:
                self.demucs = auto_load_demucs_model_v2(self.demucs_source_list, self.model_path)
                self.demucs.to(self.device)
                self.demucs.load_state_dict(torch.load(self.model_path))
                self.demucs.eval()
            else:
                self.demucs = HDemucs(sources=self.demucs_source_list)
                self.demucs = _gm(name=os.path.splitext(os.path.basename(self.model_path))[0],
                                  repo=Path(os.path.dirname(self.model_path)))
                self.demucs = demucs_segments(self.segment, self.demucs)
                self.demucs.to(self.device)
                self.demucs.eval()

            if self.pre_proc_model:
                if self.primary_stem not in [VOCAL_STEM, INST_STEM]:
                    is_no_write = True
                    self.write_to_console(DONE, base_text='')
                mix_no_voc = process_secondary_model(self.pre_proc_model, self.process_data, is_pre_proc_model=True)
                inst_mix, inst_raw_mix, inst_samplerate = prepare_mix(mix_no_voc[INST_STEM], self.chunks_demucs, self.margin_demucs)
                self.process_iteration()
                self.running_inference_console_write(is_no_write=is_no_write)
                inst_source = self.demix_demucs(inst_mix)
                inst_source = self.run_mixer(inst_raw_mix, inst_source)
                self.process_iteration()

            self.running_inference_console_write(is_no_write=is_no_write) if not self.pre_proc_model else None
            mix, raw_mix, samplerate = prepare_mix(self.audio_file, self.chunks_demucs, self.margin_demucs)

            if self.primary_model_name == self.model_basename and isinstance(self.primary_sources, np.ndarray) and self.pre_proc_model:
                source = self.primary_sources
            else:
                source = self.demix_demucs(mix)
                source = self.run_mixer(raw_mix, source)

            self.write_to_console(DONE, base_text='')

            del self.demucs
            torch.cuda.empty_cache()

        if isinstance(inst_source, np.ndarray):
            source_reshape = spec_utils.reshape_sources(inst_source[self.demucs_source_map[VOCAL_STEM]], source[self.demucs_source_map[VOCAL_STEM]])
            inst_source[self.demucs_source_map[VOCAL_STEM]] = source_reshape
            source = inst_source

        if isinstance(source, np.ndarray):
            if len(source) == 2:
                self.demucs_source_map = DEMUCS_2_SOURCE_MAPPER
            else:
                self.demucs_source_map = DEMUCS_6_SOURCE_MAPPER if len(source) == 6 else DEMUCS_4_SOURCE_MAPPER

            if len(source) == 6 and self.process_data['is_ensemble_master'] or len(source) == 6 and self.is_secondary_model:
                is_no_piano_guitar = True
                six_stem_other_source = list(source)
                six_stem_other_source = [i for n, i in enumerate(source) if n in [self.demucs_source_map[OTHER_STEM], self.demucs_source_map[GUITAR_STEM], self.demucs_source_map[PIANO_STEM]]]
                other_source = np.zeros_like(six_stem_other_source[0])
                for i in six_stem_other_source:
                    other_source += i
                source_reshape = spec_utils.reshape_sources(source[self.demucs_source_map[OTHER_STEM]], other_source)
                source[self.demucs_source_map[OTHER_STEM]] = source_reshape

        if (self.demucs_stems == ALL_STEMS and not self.process_data['is_ensemble_master']) or self.is_4_stem_ensemble:
            self.cache_source(source)

            for stem_name, stem_value in self.demucs_source_map.items():
                if self.is_secondary_model_activated and not self.is_secondary_model and not stem_value >= 4:
                    if self.secondary_model_4_stem[stem_value]:
                        model_scale = self.secondary_model_4_stem_scale[stem_value]
                        stem_source_secondary = process_secondary_model(self.secondary_model_4_stem[stem_value], self.process_data, main_model_primary_stem_4_stem=stem_name, is_4_stem_demucs=True)
                        if isinstance(stem_source_secondary, np.ndarray):
                            stem_source_secondary = stem_source_secondary[1 if self.secondary_model_4_stem[stem_value].demucs_stem_count == 2 else stem_value]
                            stem_source_secondary = spec_utils.normalize(stem_source_secondary, self.is_normalization).T
                        elif type(stem_source_secondary) is dict:
                            stem_source_secondary = stem_source_secondary[stem_name]

                stem_source_secondary = None if stem_value >= 4 else stem_source_secondary
                self.write_to_console(f'{SAVING_STEM[0]}{stem_name}{SAVING_STEM[1]}') if not self.is_secondary_model else None
                stem_path = os.path.join(self.export_path, f'{self.audio_file_base}_({stem_name}).wav')
                stem_source = spec_utils.normalize(source[stem_value], self.is_normalization).T
                self.write_audio(stem_path, stem_source, samplerate, secondary_model_source=stem_source_secondary, model_scale=model_scale)

            if self.is_secondary_model:
                return source
        else:
            if self.is_secondary_model_activated:
                if self.secondary_model:
                    self.secondary_source_primary, self.secondary_source_secondary = process_secondary_model(self.secondary_model, self.process_data, main_process_method=self.process_method)

            if not self.is_secondary_stem_only:
                self.write_to_console(f'{SAVING_STEM[0]}{self.primary_stem}{SAVING_STEM[1]}') if not self.is_secondary_model else None
                primary_stem_path = os.path.join(self.export_path, f'{self.audio_file_base}_({self.primary_stem}).wav')
                if not isinstance(self.primary_source, np.ndarray):
                    self.primary_source = spec_utils.normalize(source[self.demucs_source_map[self.primary_stem]], self.is_normalization).T
                self.primary_source_map = {self.primary_stem: self.primary_source}
                self.write_audio(primary_stem_path, self.primary_source, samplerate, self.secondary_source_primary)

            if not self.is_primary_stem_only:
                def secondary_save(sec_stem_name, source, raw_mixture=None, is_inst_mixture=False):
                    secondary_source = self.secondary_source if not is_inst_mixture else None
                    self.write_to_console(f'{SAVING_STEM[0]}{sec_stem_name}{SAVING_STEM[1]}') if not self.is_secondary_model else None
                    secondary_stem_path = os.path.join(self.export_path, f'{self.audio_file_base}_({sec_stem_name}).wav')
                    secondary_source_secondary = None

                    if not isinstance(secondary_source, np.ndarray):
                        if self.is_demucs_combine_stems:
                            source = list(source)
                            if is_inst_mixture:
                                source = [i for n, i in enumerate(source) if not n in [self.demucs_source_map[self.primary_stem], self.demucs_source_map[VOCAL_STEM]]]
                            else:
                                source.pop(self.demucs_source_map[self.primary_stem])

                            source = source[:len(source) - 2] if is_no_piano_guitar else source
                            secondary_source = np.zeros_like(source[0])
                            for i in source:
                                secondary_source += i
                            secondary_source = spec_utils.normalize(secondary_source, self.is_normalization).T
                        else:
                            if not isinstance(raw_mixture, np.ndarray):
                                raw_mixture = prepare_mix(self.audio_file, self.chunks_demucs, self.margin_demucs, is_missing_mix=True)

                            secondary_source, raw_mixture = spec_utils.normalize_two_stem(source[self.demucs_source_map[self.primary_stem]], raw_mixture, self.is_normalization)

                            if self.is_invert_spec:
                                secondary_source = spec_utils.invert_stem(raw_mixture, secondary_source)
                            else:
                                raw_mixture = spec_utils.reshape_sources(secondary_source, raw_mixture)
                                secondary_source = (-secondary_source.T+raw_mixture.T)

                    if not is_inst_mixture:
                        self.secondary_source = secondary_source
                        secondary_source_secondary = self.secondary_source_secondary
                        self.secondary_source_map = {self.secondary_stem: self.secondary_source}

                    self.write_audio(secondary_stem_path, secondary_source, samplerate, secondary_source_secondary)

                secondary_save(self.secondary_stem, source, raw_mixture=raw_mix)

                if self.is_demucs_pre_proc_model_inst_mix and self.pre_proc_model and not self.is_4_stem_ensemble:
                    secondary_save(f"{self.secondary_stem} {INST_STEM}", source, raw_mixture=inst_raw_mix, is_inst_mixture=True)

            secondary_sources = {**self.primary_source_map, **self.secondary_source_map}

            self.cache_source(secondary_sources)

            if self.is_secondary_model:
                return secondary_sources

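    # Standardizes each chunk (zero mean, unit std against the mono reference),
    # applies the version-appropriate Demucs model, then de-standardizes and
    # stitches the chunks back together.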
    def demix_demucs(self, mix):
        processed = {}

        set_progress_bar = None if self.is_chunk_demucs else self.set_progress_bar

        for nmix in mix:
            self.progress_value += 1
            self.set_progress_bar(0.1, (0.8/len(mix)*self.progress_value)) if self.is_chunk_demucs else None
            cmix = mix[nmix]
            cmix = torch.tensor(cmix, dtype=torch.float32)
            ref = cmix.mean(0)
            cmix = (cmix - ref.mean()) / ref.std()
            mix_infer = cmix

            with torch.no_grad():
                if self.demucs_version == DEMUCS_V1:
                    sources = apply_model_v1(self.demucs,
                                             mix_infer.to(self.device),
                                             self.shifts,
                                             self.is_split_mode,
                                             set_progress_bar=set_progress_bar)
                elif self.demucs_version == DEMUCS_V2:
                    sources = apply_model_v2(self.demucs,
                                             mix_infer.to(self.device),
                                             self.shifts,
                                             self.is_split_mode,
                                             self.overlap,
                                             set_progress_bar=set_progress_bar)
                else:
                    sources = apply_model(self.demucs,
                                          mix_infer[None],
                                          self.shifts,
                                          self.is_split_mode,
                                          self.overlap,
                                          static_shifts=1 if self.shifts == 0 else self.shifts,
                                          set_progress_bar=set_progress_bar,
                                          device=self.device)[0]

            sources = (sources * ref.std() + ref.mean()).cpu().numpy()
            sources[[0,1]] = sources[[1,0]]
            start = 0 if nmix == 0 else self.margin_demucs
            end = None if nmix == list(mix.keys())[::-1][0] else -self.margin_demucs
            if self.margin_demucs == 0:
                end = None
            processed[nmix] = sources[:,:,start:end].copy()
        sources = list(processed.values())
        sources = np.concatenate(sources, axis=-1)

        return sources

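# VR architecture separation: infers a magnitude mask over a multi-band
# spectrogram, with optional TTA, post-processing, and high-end mirroring.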
class SeperateVR(SeperateAttributes):

    def seperate(self):
        if self.primary_model_name == self.model_basename and self.primary_sources:
            self.primary_source, self.secondary_source = self.load_cached_sources()
        else:
            self.start_inference_console_write()
            if self.is_gpu_conversion >= 0:
                if OPERATING_SYSTEM == 'Darwin':
                    device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
                else:
                    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
            else:
                device = torch.device('cpu')

            nn_arch_sizes = [
                31191, # default
                33966, 56817, 123821, 123812, 129605, 218409, 537238, 537227]
            vr_5_1_models = [56817, 218409]
            model_size = math.ceil(os.stat(self.model_path).st_size / 1024)
            nn_arch_size = min(nn_arch_sizes, key=lambda x: abs(x-model_size))

            if nn_arch_size in vr_5_1_models or self.is_vr_51_model:
                self.model_run = nets_new.CascadedNet(self.mp.param['bins'] * 2, nn_arch_size, nout=self.model_capacity[0], nout_lstm=self.model_capacity[1])
            else:
                self.model_run = nets.determine_model_capacity(self.mp.param['bins'] * 2, nn_arch_size)

            self.model_run.load_state_dict(torch.load(self.model_path, map_location=cpu))
            self.model_run.to(device)

            self.running_inference_console_write()

            y_spec, v_spec = self.inference_vr(self.loading_mix(), device, self.aggressiveness)
            self.write_to_console(DONE, base_text='')

        if self.is_secondary_model_activated:
            if self.secondary_model:
                self.secondary_source_primary, self.secondary_source_secondary = process_secondary_model(self.secondary_model, self.process_data, main_process_method=self.process_method)

        if not self.is_secondary_stem_only:
            self.write_to_console(f'{SAVING_STEM[0]}{self.primary_stem}{SAVING_STEM[1]}') if not self.is_secondary_model else None
            primary_stem_path = os.path.join(self.export_path, f'{self.audio_file_base}_({self.primary_stem}).wav')
            if not isinstance(self.primary_source, np.ndarray):
                self.primary_source = spec_utils.normalize(self.spec_to_wav(y_spec), self.is_normalization).T
                if not self.model_samplerate == 44100:
                    self.primary_source = librosa.resample(self.primary_source.T, orig_sr=self.model_samplerate, target_sr=44100).T

            self.primary_source_map = {self.primary_stem: self.primary_source}

            self.write_audio(primary_stem_path, self.primary_source, 44100, self.secondary_source_primary)

        if not self.is_primary_stem_only:
            self.write_to_console(f'{SAVING_STEM[0]}{self.secondary_stem}{SAVING_STEM[1]}') if not self.is_secondary_model else None
            secondary_stem_path = os.path.join(self.export_path, f'{self.audio_file_base}_({self.secondary_stem}).wav')
            if not isinstance(self.secondary_source, np.ndarray):
                self.secondary_source = spec_utils.normalize(self.spec_to_wav(v_spec), self.is_normalization).T
                if not self.model_samplerate == 44100:
                    self.secondary_source = librosa.resample(self.secondary_source.T, orig_sr=self.model_samplerate, target_sr=44100).T

            self.secondary_source_map = {self.secondary_stem: self.secondary_source}

            self.write_audio(secondary_stem_path, self.secondary_source, 44100, self.secondary_source_secondary)

        torch.cuda.empty_cache()
        secondary_sources = {**self.primary_source_map, **self.secondary_source_map}
        self.cache_source(secondary_sources)

        if self.is_secondary_model:
            return secondary_sources

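    # Builds the multi-band spectrogram: loads the highest band at its sample
    # rate, downsamples it for the lower bands, and keeps the cropped high end
    # around for mirroring on output.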
    def loading_mix(self):

        X_wave, X_spec_s = {}, {}

        bands_n = len(self.mp.param['band'])

        for d in range(bands_n, 0, -1):
            bp = self.mp.param['band'][d]

            if OPERATING_SYSTEM == 'Darwin':
                wav_resolution = 'polyphase' if SYSTEM_PROC == ARM or ARM in SYSTEM_ARCH else bp['res_type']
            else:
                wav_resolution = bp['res_type']

            if d == bands_n: # high-end band
                X_wave[d], _ = librosa.load(self.audio_file, bp['sr'], False, dtype=np.float32, res_type=wav_resolution)

                if not np.any(X_wave[d]) and self.audio_file.endswith('.mp3'):
                    X_wave[d] = rerun_mp3(self.audio_file, bp['sr'])

                if X_wave[d].ndim == 1:
                    X_wave[d] = np.asarray([X_wave[d], X_wave[d]])
            else: # lower bands
                X_wave[d] = librosa.resample(X_wave[d+1], self.mp.param['band'][d+1]['sr'], bp['sr'], res_type=wav_resolution)

            X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(X_wave[d], bp['hl'], bp['n_fft'], self.mp.param['mid_side'],
                                                            self.mp.param['mid_side_b2'], self.mp.param['reverse'])

            if d == bands_n and self.high_end_process != 'none':
                self.input_high_end_h = (bp['n_fft']//2 - bp['crop_stop']) + (self.mp.param['pre_filter_stop'] - self.mp.param['pre_filter_start'])
                self.input_high_end = X_spec_s[d][:, bp['n_fft']//2-self.input_high_end_h:bp['n_fft']//2, :]

        X_spec = spec_utils.combine_spectrograms(X_spec_s, self.mp)

        del X_wave, X_spec_s

        return X_spec

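    # Mask inference: pads the magnitude spectrogram, predicts the mask in
    # batched windows, optionally averages with a half-window-shifted second
    # pass (TTA), then splits the mix into primary/secondary spectrograms.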
    def inference_vr(self, X_spec, device, aggressiveness):
        def _execute(X_mag_pad, roi_size):
            X_dataset = []
            patches = (X_mag_pad.shape[2] - 2 * self.model_run.offset) // roi_size
            total_iterations = patches//self.batch_size if not self.is_tta else (patches//self.batch_size)*2
            for i in range(patches):
                start = i * roi_size
                X_mag_window = X_mag_pad[:, :, start:start + self.window_size]
                X_dataset.append(X_mag_window)

            X_dataset = np.asarray(X_dataset)
            self.model_run.eval()
            with torch.no_grad():
                mask = []
                for i in range(0, patches, self.batch_size):
                    self.progress_value += 1
                    if self.progress_value >= total_iterations:
                        self.progress_value = total_iterations
                    self.set_progress_bar(0.1, 0.8/total_iterations*self.progress_value)
                    X_batch = X_dataset[i: i + self.batch_size]
                    X_batch = torch.from_numpy(X_batch).to(device)
                    pred = self.model_run.predict_mask(X_batch)
                    if not pred.size()[3] > 0:
                        raise Exception(ERROR_MAPPER[WINDOW_SIZE_ERROR])
                    pred = pred.detach().cpu().numpy()
                    pred = np.concatenate(pred, axis=2)
                    mask.append(pred)
                if len(mask) == 0:
                    raise Exception(ERROR_MAPPER[WINDOW_SIZE_ERROR])

                mask = np.concatenate(mask, axis=2)
            return mask

        def postprocess(mask, X_mag, X_phase):

            is_non_accom_stem = False
            for stem in NON_ACCOM_STEMS:
                if stem == self.primary_stem:
                    is_non_accom_stem = True

            mask = spec_utils.adjust_aggr(mask, is_non_accom_stem, aggressiveness)

            if self.is_post_process:
                mask = spec_utils.merge_artifacts(mask, thres=self.post_process_threshold)

            y_spec = mask * X_mag * np.exp(1.j * X_phase)
            v_spec = (1 - mask) * X_mag * np.exp(1.j * X_phase)

            return y_spec, v_spec

        X_mag, X_phase = spec_utils.preprocess(X_spec)
        n_frame = X_mag.shape[2]
        pad_l, pad_r, roi_size = spec_utils.make_padding(n_frame, self.window_size, self.model_run.offset)
        X_mag_pad = np.pad(X_mag, ((0, 0), (0, 0), (pad_l, pad_r)), mode='constant')
        X_mag_pad /= X_mag_pad.max()
        mask = _execute(X_mag_pad, roi_size)

        if self.is_tta:
            pad_l += roi_size // 2
            pad_r += roi_size // 2
            X_mag_pad = np.pad(X_mag, ((0, 0), (0, 0), (pad_l, pad_r)), mode='constant')
            X_mag_pad /= X_mag_pad.max()
            mask_tta = _execute(X_mag_pad, roi_size)
            mask_tta = mask_tta[:, :, roi_size // 2:]
            mask = (mask[:, :, :n_frame] + mask_tta[:, :, :n_frame]) * 0.5
        else:
            mask = mask[:, :, :n_frame]

        y_spec, v_spec = postprocess(mask, X_mag, X_phase)

        return y_spec, v_spec

    def spec_to_wav(self, spec):

        if self.high_end_process.startswith('mirroring'):
            input_high_end_ = spec_utils.mirroring(self.high_end_process, spec, self.input_high_end, self.mp)
            wav = spec_utils.cmb_spectrogram_to_wave(spec, self.mp, self.input_high_end_h, input_high_end_)
        else:
            wav = spec_utils.cmb_spectrogram_to_wave(spec, self.mp)

        return wav

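# Module-level helpers: dispatch a secondary/pre-process model to the matching
# separator class and pull the requested stems out of its cached sources.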
def process_secondary_model(secondary_model: ModelData, process_data, main_model_primary_stem_4_stem=None, is_4_stem_demucs=False, main_process_method=None, is_pre_proc_model=False):

    if not is_pre_proc_model:
        process_iteration = process_data['process_iteration']
        process_iteration()

    if secondary_model.process_method == VR_ARCH_TYPE:
        seperator = SeperateVR(secondary_model, process_data, main_model_primary_stem_4_stem=main_model_primary_stem_4_stem, main_process_method=main_process_method)
    if secondary_model.process_method == MDX_ARCH_TYPE:
        seperator = SeperateMDX(secondary_model, process_data, main_model_primary_stem_4_stem=main_model_primary_stem_4_stem, main_process_method=main_process_method)
    if secondary_model.process_method == DEMUCS_ARCH_TYPE:
        seperator = SeperateDemucs(secondary_model, process_data, main_model_primary_stem_4_stem=main_model_primary_stem_4_stem, main_process_method=main_process_method)

    secondary_sources = seperator.seperate()

    if type(secondary_sources) is dict and not is_4_stem_demucs and not is_pre_proc_model:
        return gather_sources(secondary_model.primary_model_primary_stem, STEM_PAIR_MAPPER[secondary_model.primary_model_primary_stem], secondary_sources)
    else:
        return secondary_sources

def gather_sources(primary_stem_name, secondary_stem_name, secondary_sources: dict):

    source_primary = False
    source_secondary = False

    for key, value in secondary_sources.items():
        if key in primary_stem_name:
            source_primary = value
        if key in secondary_stem_name:
            source_secondary = value

    return source_primary, source_secondary

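# Loads (or passes through) the mixture, patches mono/silent-MP3 edge cases,
# and splits it into overlapping chunks keyed by start sample; the margins are
# trimmed again when the chunks are stitched back together.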
def prepare_mix(mix, chunk_set, margin_set, mdx_net_cut=False, is_missing_mix=False):

    audio_path = mix
    samplerate = 44100

    if not isinstance(mix, np.ndarray):
        mix, samplerate = librosa.load(mix, mono=False, sr=44100)
    else:
        mix = mix.T

    if not np.any(mix) and audio_path.endswith('.mp3'):
        mix = rerun_mp3(audio_path)

    if mix.ndim == 1:
        mix = np.asfortranarray([mix,mix])

    def get_segmented_mix(chunk_set=chunk_set):
        segmented_mix = {}

        samples = mix.shape[-1]
        margin = margin_set
        chunk_size = chunk_set*44100
        assert not margin == 0, 'margin cannot be zero!'

        if margin > chunk_size:
            margin = chunk_size
        if chunk_set == 0 or samples < chunk_size:
            chunk_size = samples

        counter = -1
        for skip in range(0, samples, chunk_size):
            counter += 1
            s_margin = 0 if counter == 0 else margin
            end = min(skip+chunk_size+margin, samples)
            start = skip-s_margin
            segmented_mix[skip] = mix[:,start:end].copy()
            if end == samples:
                break

        return segmented_mix

    if is_missing_mix:
        return mix
    else:
        segmented_mix = get_segmented_mix()
        raw_mix = get_segmented_mix(chunk_set=0) if mdx_net_cut else mix
        return segmented_mix, raw_mix, samplerate

def rerun_mp3(audio_file, sample_rate=44100):

    with audioread.audio_open(audio_file) as f:
        track_length = int(f.duration)

    return librosa.load(audio_file, duration=track_length, mono=False, sr=sample_rate)[0]

def save_format(audio_path, save_format, mp3_bit_set):

    if not save_format == WAV:

        if OPERATING_SYSTEM == 'Darwin':
            FFMPEG_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'ffmpeg')
            pydub.AudioSegment.converter = FFMPEG_PATH

        musfile = pydub.AudioSegment.from_wav(audio_path)

        if save_format == FLAC:
            audio_path_flac = audio_path.replace(".wav", ".flac")
            musfile.export(audio_path_flac, format="flac")

        if save_format == MP3:
            audio_path_mp3 = audio_path.replace(".wav", ".mp3")
            musfile.export(audio_path_mp3, format="mp3", bitrate=mp3_bit_set)

        try:
            os.remove(audio_path)
        except Exception as e:
            print(e)
webUI.py
ADDED
@@ -0,0 +1,285 @@
import os
import json

import librosa
import soundfile
import numpy as np

import gradio as gr
from UVR_interface import root, UVRInterface, VR_MODELS_DIR, MDX_MODELS_DIR, DEMUCS_MODELS_DIR
from gui_data.constants import *
from typing import List, Dict, Callable, Union

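# Gradio front-end: builds the UI, mirrors widget state into the Tk-style
# `root` variables used by the UVR backend, and wires the process button to
# UVRInterface.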
class UVRWebUI:
    def __init__(self, uvr: UVRInterface, online_data_path: str) -> None:
        self.uvr = uvr
        self.models_url = self.get_models_url(online_data_path)
        self.define_layout()

        self.input_temp_dir = "__temp"
        self.export_path = "out"
        if not os.path.exists(self.input_temp_dir):
            os.mkdir(self.input_temp_dir)

    def get_models_url(self, models_info_path: str) -> Dict[str, Dict]:
        with open(models_info_path, "r") as f:
            online_data = json.loads(f.read())
        models_url = {}
        for arch, download_list_key in zip([VR_ARCH_TYPE, MDX_ARCH_TYPE], ["vr_download_list", "mdx_download_list"]):
            models_url[arch] = {model: NORMAL_REPO+model_path for model, model_path in online_data[download_list_key].items()}
        models_url[DEMUCS_ARCH_TYPE] = online_data["demucs_download_list"]
        return models_url

    def get_local_models(self, arch: str) -> List[str]:
        model_config = {
            VR_ARCH_TYPE: (VR_MODELS_DIR, ".pth"),
            MDX_ARCH_TYPE: (MDX_MODELS_DIR, ".onnx"),
            DEMUCS_ARCH_TYPE: (DEMUCS_MODELS_DIR, ".yaml"),
        }
        try:
            model_dir, suffix = model_config[arch]
        except KeyError:
            raise ValueError(f"Unknown arch type: {arch}")
        return [os.path.splitext(f)[0] for f in os.listdir(model_dir) if f.endswith(suffix)]

    def set_arch_setting_value(self, arch: str, setting1, setting2):
        if arch == VR_ARCH_TYPE:
            root.window_size_var.set(setting1)
            root.aggression_setting_var.set(setting2)
        elif arch == MDX_ARCH_TYPE:
            root.mdx_batch_size_var.set(setting1)
            root.compensate_var.set(setting2)
        elif arch == DEMUCS_ARCH_TYPE:
            pass

    def arch_select_update(self, arch: str) -> List[Dict]:
        choices = self.get_local_models(arch)
        if arch == VR_ARCH_TYPE:
            model_update = self.model_choice.update(choices=choices, value=CHOOSE_MODEL, label=SELECT_VR_MODEL_MAIN_LABEL)
            setting1_update = self.arch_setting1.update(choices=VR_WINDOW, label=WINDOW_SIZE_MAIN_LABEL, value=root.window_size_var.get())
            setting2_update = self.arch_setting2.update(choices=VR_AGGRESSION, label=AGGRESSION_SETTING_MAIN_LABEL, value=root.aggression_setting_var.get())
        elif arch == MDX_ARCH_TYPE:
            model_update = self.model_choice.update(choices=choices, value=CHOOSE_MODEL, label=CHOOSE_MDX_MODEL_MAIN_LABEL)
            setting1_update = self.arch_setting1.update(choices=BATCH_SIZE, label=BATCHES_MDX_MAIN_LABEL, value=root.mdx_batch_size_var.get())
            setting2_update = self.arch_setting2.update(choices=VOL_COMPENSATION, label=VOL_COMP_MDX_MAIN_LABEL, value=root.compensate_var.get())
        elif arch == DEMUCS_ARCH_TYPE:
            model_update = self.model_choice.update(choices=choices, value=CHOOSE_MODEL, label=CHOOSE_DEMUCS_MODEL_MAIN_LABEL)
            raise gr.Error(f"{DEMUCS_ARCH_TYPE} not implemented")
        else:
            raise gr.Error(f"Unknown arch type: {arch}")
        return [model_update, setting1_update, setting2_update]

    def model_select_update(self, arch: str, model_name: str) -> List[Union[str, Dict, None]]:
        if model_name == CHOOSE_MODEL:
            return [None for _ in range(4)]
        model, = self.uvr.assemble_model_data(model_name, arch)
        if not model.model_status:
            raise gr.Error(f"Cannot get model data, model hash = {model.model_hash}")

        stem1_check_update = self.primary_stem_only.update(label=f"{model.primary_stem} Only")
        stem2_check_update = self.secondary_stem_only.update(label=f"{model.secondary_stem} Only")
        stem1_out_update = self.primary_stem_out.update(label=f"Output {model.primary_stem}")
        stem2_out_update = self.secondary_stem_out.update(label=f"Output {model.secondary_stem}")

        return [stem1_check_update, stem2_check_update, stem1_out_update, stem2_out_update]

    def checkbox_set_root_value(self, checkbox: gr.Checkbox, root_attr: str):
        checkbox.change(lambda value: root.__getattribute__(root_attr).set(value), inputs=checkbox)

    def set_checkboxes_exclusive(self, checkboxes: List[gr.Checkbox], pure_callbacks: List[Callable], exclusive_value=True):
        def exclusive_onchange(i, callback_i):
            def new_onchange(*check_values):
                if check_values[i] == exclusive_value:
                    return_values = []
                    for j, value_j in enumerate(check_values):
                        if j != i and value_j == exclusive_value:
                            return_values.append(not exclusive_value)
                        else:
                            return_values.append(value_j)
                else:
                    return_values = check_values
                callback_i(check_values[i])
                return return_values
            return new_onchange

        for i, (checkbox, callback) in enumerate(zip(checkboxes, pure_callbacks)):
            checkbox.change(exclusive_onchange(i, callback), inputs=checkboxes, outputs=checkboxes)

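    # Button callback: writes the uploaded audio to a temp wav, pushes the
    # chosen arch settings into `root`, runs the separator, and returns the
    # stem audio plus a status message.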
    def process(self, input_audio, input_filename, model_name, arch, setting1, setting2, progress=gr.Progress()):
        def set_progress_func(step, inference_iterations=0):
            progress_curr = step + inference_iterations
            progress(progress_curr)

        sampling_rate, audio = input_audio
        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.transpose(1, 0))
        input_path = os.path.join(self.input_temp_dir, input_filename)
        soundfile.write(input_path, audio, sampling_rate, format="wav")

        self.set_arch_setting_value(arch, setting1, setting2)

        seperator = self.uvr.process(
            model_name=model_name,
            arch_type=arch,
            audio_file=input_path,
            export_path=self.export_path,
            is_model_sample_mode=root.model_sample_mode_var.get(),
            set_progress_func=set_progress_func,
        )

        primary_audio = None
        secondary_audio = None
        msg = ""
        if not seperator.is_secondary_stem_only:
            primary_stem_path = os.path.join(seperator.export_path, f"{seperator.audio_file_base}_({seperator.primary_stem}).wav")
            audio, rate = soundfile.read(primary_stem_path)
            primary_audio = (rate, audio)
            msg += f"{seperator.primary_stem} saved at {primary_stem_path}\n"
        if not seperator.is_primary_stem_only:
            secondary_stem_path = os.path.join(seperator.export_path, f"{seperator.audio_file_base}_({seperator.secondary_stem}).wav")
            audio, rate = soundfile.read(secondary_stem_path)
            secondary_audio = (rate, audio)
            msg += f"{seperator.secondary_stem} saved at {secondary_stem_path}\n"

        os.remove(input_path)

        return primary_audio, secondary_audio, msg

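    # Declarative layout: a process tab (model/settings dropdowns, stem
    # checkboxes, audio in/out) and a settings tab (wav/mp3 options, model
    # download links), with the change/click event wiring at the bottom.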
def define_layout(self):
|
151 |
+
with gr.Blocks() as app:
|
152 |
+
self.app = app
|
153 |
+
gr.HTML("<h1> 🎵 Ultimate Vocal Remover WebUI 🎵 </h1>")
|
154 |
+
gr.Markdown("This is an experimental demo with CPU. Duplicate the space for use in private")
|
155 |
+
gr.Markdown(
|
156 |
+
"[![Duplicate this Space](https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-sm-dark.svg)](https://huggingface.co/spaces/r3gm/Ultimate-Vocal-Remover-WebUI?duplicate=true)\n\n"
|
157 |
+
)
|
158 |
+
with gr.Tabs():
|
159 |
+
with gr.TabItem("process"):
|
160 |
+
with gr.Row():
|
161 |
+
self.arch_choice = gr.Dropdown(
|
162 |
+
choices=[VR_ARCH_TYPE, MDX_ARCH_TYPE], value=VR_ARCH_TYPE, # choices=[VR_ARCH_TYPE, MDX_ARCH_TYPE, DEMUCS_ARCH_TYPE], value=VR_ARCH_TYPE,
|
163 |
+
label=CHOOSE_PROC_METHOD_MAIN_LABEL, interactive=True)
|
164 |
+
self.model_choice = gr.Dropdown(
|
165 |
+
choices=self.get_local_models(VR_ARCH_TYPE), value=CHOOSE_MODEL,
|
166 |
+
label=SELECT_VR_MODEL_MAIN_LABEL+' 👋Select a model', interactive=True)
|
167 |
+
with gr.Row():
|
168 |
+
self.arch_setting1 = gr.Dropdown(
|
169 |
+
choices=VR_WINDOW, value=root.window_size_var.get(),
|
170 |
+
label=WINDOW_SIZE_MAIN_LABEL+' 👋Select one', interactive=True)
|
171 |
+
self.arch_setting2 = gr.Dropdown(
|
172 |
+
choices=VR_AGGRESSION, value=root.aggression_setting_var.get(),
|
173 |
+
label=AGGRESSION_SETTING_MAIN_LABEL, interactive=True)
|
174 |
+
with gr.Row():
|
175 |
+
self.use_gpu = gr.Checkbox(
|
176 |
+
label='Rhythmic Transmutation Device', value=True, interactive=True) #label=GPU_CONVERSION_MAIN_LABEL, value=root.is_gpu_conversion_var.get(), interactive=True)
|
177 |
+
self.primary_stem_only = gr.Checkbox(
|
178 |
+
label=f"{PRIMARY_STEM} only", value=root.is_primary_stem_only_var.get(), interactive=True)
|
179 |
+
self.secondary_stem_only = gr.Checkbox(
|
180 |
+
label=f"{SECONDARY_STEM} only", value=root.is_secondary_stem_only_var.get(), interactive=True)
|
181 |
+
self.sample_mode = gr.Checkbox(
|
182 |
+
label=SAMPLE_MODE_CHECKBOX(root.model_sample_mode_duration_var.get()),
|
183 |
+
value=root.model_sample_mode_var.get(), interactive=True)
|
184 |
+
|
185 |
+
                    with gr.Row():
                        self.input_filename = gr.Textbox(label="Input filename", value="temp.wav", interactive=True)
                    with gr.Row():
                        self.audio_in = gr.Audio(label="Input audio", interactive=True)
                    with gr.Row():
                        self.process_submit = gr.Button(START_PROCESSING, variant="primary")
                    with gr.Row():
                        self.primary_stem_out = gr.Audio(label=f"Output {PRIMARY_STEM}", interactive=False)
                        self.secondary_stem_out = gr.Audio(label=f"Output {SECONDARY_STEM}", interactive=False)
                    with gr.Row():
                        self.out_message = gr.Textbox(label="Output Message", interactive=False)

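                # Settings tab: export-format options plus download links for
                # additional VR, MDX-Net, and Demucs models.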
with gr.TabItem("settings"):
|
198 |
+
with gr.Tabs():
|
199 |
+
with gr.TabItem("Settings Guide"):
|
200 |
+
pass
|
201 |
+
with gr.TabItem("Additional Settigns"):
|
202 |
+
self.wav_type = gr.Dropdown(choices=WAV_TYPE, label="Wav Type", value="PCM_16", interactive=True)
|
203 |
+
self.mp3_rate = gr.Dropdown(choices=MP3_BIT_RATES, label="MP3 Bitrate", value="320k",interactive=True)
|
204 |
+
with gr.TabItem("Download models"):
|
205 |
+
|
206 |
+
def md_url(url, text=None):
|
207 |
+
if text is None:
|
208 |
+
text = url
|
209 |
+
return f"[{url}]({url})"
|
210 |
+
|
211 |
+
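                            # One row per architecture: picking a model renders
                            # its download URL(s) as Markdown links via md_url.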
                            with gr.Row():
                                vr_models = self.models_url[VR_ARCH_TYPE]
                                self.vr_download_choice = gr.Dropdown(choices=list(vr_models.keys()), label=f"Select {VR_ARCH_TYPE} Model", interactive=True)
                                self.vr_download_url = gr.Markdown()
                                self.vr_download_choice.change(lambda model: md_url(vr_models[model]), inputs=self.vr_download_choice, outputs=self.vr_download_url)
                            with gr.Row(variant="panel"):
                                mdx_models = self.models_url[MDX_ARCH_TYPE]
                                self.mdx_download_choice = gr.Dropdown(choices=list(mdx_models.keys()), label=f"Select {MDX_ARCH_TYPE} Model", interactive=True)
                                self.mdx_download_url = gr.Markdown()
                                self.mdx_download_choice.change(lambda model: md_url(mdx_models[model]), inputs=self.mdx_download_choice, outputs=self.mdx_download_url)
                            with gr.Row(variant="panel"):
                                demucs_models: Dict[str, Dict] = self.models_url[DEMUCS_ARCH_TYPE]
                                self.demucs_download_choice = gr.Dropdown(choices=list(demucs_models.keys()), label=f"Select {DEMUCS_ARCH_TYPE} Model", interactive=True)
                                self.demucs_download_url = gr.Markdown()

                                self.demucs_download_choice.change(
                                    lambda model: "\n".join([
                                        "- " + md_url(url, text=filename) for filename, url in demucs_models[model].items()]),
                                    inputs=self.demucs_download_choice,
                                    outputs=self.demucs_download_url)

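            # Event wiring: switching architecture repopulates the model list
            # and the two architecture settings; switching models updates the
            # stem checkboxes and output players.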
            self.arch_choice.change(
                self.arch_select_update, inputs=self.arch_choice,
                outputs=[self.model_choice, self.arch_setting1, self.arch_setting2])
            self.model_choice.change(
                self.model_select_update, inputs=[self.arch_choice, self.model_choice],
                outputs=[self.primary_stem_only, self.secondary_stem_only, self.primary_stem_out, self.secondary_stem_out])

            self.checkbox_set_root_value(self.use_gpu, 'is_gpu_conversion_var')
            self.checkbox_set_root_value(self.sample_mode, 'model_sample_mode_var')
            self.set_checkboxes_exclusive(
                [self.primary_stem_only, self.secondary_stem_only],
                [lambda value: root.is_primary_stem_only_var.set(value), lambda value: root.is_secondary_stem_only_var.set(value)])

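            # Start button: `process` returns (rate, ndarray) tuples for the
            # two stem players plus a status string for the message box.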
            self.process_submit.click(
                self.process,
                inputs=[self.audio_in, self.input_filename, self.model_choice, self.arch_choice, self.arch_setting1, self.arch_setting2],
                outputs=[self.primary_stem_out, self.secondary_stem_out, self.out_message])

    def launch(self, **kwargs):
        self.app.queue().launch(**kwargs)


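# Module-level bootstrap: create the separation backend, clear any cached
# sources, and build a first UI instance so its model catalog can be read.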
uvr = UVRInterface()
uvr.cached_sources_clear()

webui = UVRWebUI(uvr, online_data_path='models/download_checks.json')

print(webui.models_url)
model_dict = webui.models_url

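# Pre-fetch every VR and MDX-Net model into its models/ subfolder via aria2c
# (-x16/-s16: up to 16 connections/splits per file, -k1M: 1 MiB split size,
# -c: resume partial downloads, -d: target directory).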
for category, models in model_dict.items():
    if category in ['VR Arc', 'MDX-Net']:
        if category == 'VR Arc':
            model_path = 'models/VR_Models'
        elif category == 'MDX-Net':
            model_path = 'models/MDX_Net_Models'

        for model_name, model_url in models.items():
            cmd = f"aria2c --optimize-concurrent-downloads --console-log-level=error --summary-interval=10 -j5 -x16 -s16 -k1M -c -d {model_path} -Z {model_url}"
            os.system(cmd)

        print("Models downloaded successfully.")
    else:
        print(f"Ignoring category: {category}")

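# Rebuild the UI so the model dropdown picks up the files downloaded above,
# then launch the Space.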
webui = UVRWebUI(uvr, online_data_path='models/download_checks.json')
webui.launch()