Jarod Castillo committed on
Commit
88b57c0
1 Parent(s): e6e9e8c
.gitignore ADDED
@@ -0,0 +1,168 @@
1
+ ### Jarod's Custom Template
2
+ .idea
3
+ .vscode
4
+ datasets
5
+ ignored
6
+
7
+ ### Python template
8
+ # Byte-compiled / optimized / DLL files
9
+ __pycache__/
10
+ *.py[cod]
11
+ *$py.class
12
+
13
+ # C extensions
14
+ *.so
15
+
16
+ # Distribution / packaging
17
+ .Python
18
+ build/
19
+ develop-eggs/
20
+ dist/
21
+ downloads/
22
+ eggs/
23
+ .eggs/
24
+ lib/
25
+ lib64/
26
+ parts/
27
+ sdist/
28
+ var/
29
+ wheels/
30
+ share/python-wheels/
31
+ *.egg-info/
32
+ .installed.cfg
33
+ *.egg
34
+ MANIFEST
35
+
36
+ # PyInstaller
37
+ # Usually these files are written by a python script from a template
38
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
39
+ *.manifest
40
+ *.spec
41
+
42
+ # Installer logs
43
+ pip-log.txt
44
+ pip-delete-this-directory.txt
45
+
46
+ # Unit test / coverage reports
47
+ htmlcov/
48
+ .tox/
49
+ .nox/
50
+ .coverage
51
+ .coverage.*
52
+ .cache
53
+ nosetests.xml
54
+ coverage.xml
55
+ *.cover
56
+ *.py,cover
57
+ .hypothesis/
58
+ .pytest_cache/
59
+ cover/
60
+
61
+ # Translations
62
+ *.mo
63
+ *.pot
64
+
65
+ # Django stuff:
66
+ *.log
67
+ local_settings.py
68
+ db.sqlite3
69
+ db.sqlite3-journal
70
+
71
+ # Flask stuff:
72
+ instance/
73
+ .webassets-cache
74
+
75
+ # Scrapy stuff:
76
+ .scrapy
77
+
78
+ # Sphinx documentation
79
+ docs/_build/
80
+
81
+ # PyBuilder
82
+ .pybuilder/
83
+ target/
84
+
85
+ # Jupyter Notebook
86
+ .ipynb_checkpoints
87
+
88
+ # IPython
89
+ profile_default/
90
+ ipython_config.py
91
+
92
+ # pyenv
93
+ # For a library or package, you might want to ignore these files since the code is
94
+ # intended to run in multiple environments; otherwise, check them in:
95
+ # .python-version
96
+
97
+ # pipenv
98
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
99
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
100
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
101
+ # install all needed dependencies.
102
+ #Pipfile.lock
103
+
104
+ # poetry
105
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
106
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
107
+ # commonly ignored for libraries.
108
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
109
+ #poetry.lock
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ #pdm.lock
114
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
115
+ # in version control.
116
+ # https://pdm.fming.dev/#use-with-ide
117
+ .pdm.toml
118
+
119
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
120
+ __pypackages__/
121
+
122
+ # Celery stuff
123
+ celerybeat-schedule
124
+ celerybeat.pid
125
+
126
+ # SageMath parsed files
127
+ *.sage.py
128
+
129
+ # Environments
130
+ .env
131
+ .venv
132
+ env/
133
+ venv/
134
+ ENV/
135
+ env.bak/
136
+ venv.bak/
137
+
138
+ # Spyder project settings
139
+ .spyderproject
140
+ .spyproject
141
+
142
+ # Rope project settings
143
+ .ropeproject
144
+
145
+ # mkdocs documentation
146
+ /site
147
+
148
+ # mypy
149
+ .mypy_cache/
150
+ .dmypy.json
151
+ dmypy.json
152
+
153
+ # Pyre type checker
154
+ .pyre/
155
+
156
+ # pytype static type analyzer
157
+ .pytype/
158
+
159
+ # Cython debug symbols
160
+ cython_debug/
161
+
162
+ # PyCharm
163
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
164
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
165
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
166
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
167
+ #.idea/
168
+
.streamlit/config.toml ADDED
@@ -0,0 +1,120 @@
1
+ [server]
2
+
3
+ # List of folders that should not be watched for changes. This
4
+ # impacts both "Run on Save" and @st.cache.
5
+
6
+ # Relative paths will be taken as relative to the current working directory.
7
+
8
+ # Example: ['/home/user1/env', 'relative/path/to/folder']
9
+
10
+ # Default: []
11
+ folderWatchBlacklist = []
12
+
13
+ # Change the type of file watcher used by Streamlit, or turn it off
14
+ # completely.
15
+
16
+ # Allowed values:
17
+ # * "auto" : Streamlit will attempt to use the watchdog module, and
18
+ # falls back to polling if watchdog is not available.
19
+ # * "watchdog" : Force Streamlit to use the watchdog module.
20
+ # * "poll" : Force Streamlit to always use polling.
21
+ # * "none" : Streamlit will not watch files.
22
+
23
+ # Default: "auto"
24
+ fileWatcherType = "auto"
25
+
26
+ # Symmetric key used to produce signed cookies. If deploying on multiple
27
+ # replicas, this should be set to the same value across all replicas to ensure
28
+ # they all share the same secret.
29
+
30
+ # Default: randomly generated secret key.
31
+ cookieSecret = "a-random-key-appears-here"
32
+
33
+ # If false, will attempt to open a browser window on start.
34
+
35
+ # Default: false unless (1) we are on a Linux box where DISPLAY is unset, or
36
+ # (2) we are running in the Streamlit Atom plugin.
37
+ # headless = false
38
+
39
+ # Automatically rerun script when the file is modified on disk.
40
+
41
+ # Default: false
42
+ runOnSave = false
43
+
44
+ # The address where the server will listen for client and browser
45
+ # connections. Use this if you want to bind the server to a specific address.
46
+ # If set, the server will only be accessible from this address, and not from
47
+ # any aliases (like localhost).
48
+
49
+ # Default: (unset)
50
+ # address =
51
+
52
+ # The port where the server will listen for browser connections.
53
+
54
+ # Default: 8501
55
+ port = 8501
56
+
57
+ # The base path for the URL where Streamlit should be served from.
58
+
59
+ # Default: ""
60
+ baseUrlPath = ""
61
+
62
+ # Enables support for Cross-Origin Resource Sharing (CORS) protection, for
63
+ # added security.
64
+
65
+ # Due to conflicts between CORS and XSRF, if `server.enableXsrfProtection` is
66
+ # on and `server.enableCORS` is off at the same time, we will prioritize
67
+ # `server.enableXsrfProtection`.
68
+
69
+ # Default: true
70
+ enableCORS = true
71
+
72
+ # Enables support for Cross-Site Request Forgery (XSRF) protection, for added
73
+ # security.
74
+
75
+ # Due to conflicts between CORS and XSRF, if `server.enableXsrfProtection` is
76
+ # on and `server.enableCORS` is off at the same time, we will prioritize
77
+ # `server.enableXsrfProtection`.
78
+
79
+ # Default: true
80
+ enableXsrfProtection = true
81
+
82
+ # Max size, in megabytes, for files uploaded with the file_uploader.
83
+
84
+ # Default: 200
85
+ maxUploadSize = 200
86
+
87
+ # Max size, in megabytes, of messages that can be sent via the WebSocket
88
+ # connection.
89
+
90
+ # Default: 200
91
+ maxMessageSize = 200
92
+
93
+ # Enables support for websocket compression.
94
+
95
+ # Default: false
96
+ enableWebsocketCompression = false
97
+
98
+ # Enable serving files from a `static` directory in the running app's
99
+ # directory.
100
+
101
+ # Default: false
102
+ enableStaticServing = false
103
+
104
+ # Server certificate file for connecting via HTTPS.
105
+ # Must be set at the same time as "server.sslKeyFile".
106
+
107
+ # ['DO NOT USE THIS OPTION IN A PRODUCTION ENVIRONMENT. It has not gone through
108
+ # security audits or performance tests. For the production environment, we
109
+ # recommend performing SSL termination by the load balancer or the reverse
110
+ # proxy.']
111
+ # sslCertFile =
112
+
113
+ # Cryptographic key file for connecting via HTTPS.
114
+ # Must be set at the same time as "server.sslCertFile".
115
+
116
+ # ['DO NOT USE THIS OPTION IN A PRODUCTION ENVIRONMENT. It has not gone through
117
+ # security audits or performance tests. For the production environment, we
118
+ # recommend performing SSL termination by the load balancer or the reverse
119
+ # proxy.']
120
+ # sslKeyFile =
README.md CHANGED
@@ -1,13 +1,70 @@
1
- ---
2
- title: Sing For Me
3
- emoji: 🌍
4
- colorFrom: blue
5
- colorTo: indigo
6
- sdk: streamlit
7
- sdk_version: 1.25.0
8
- app_file: app.py
9
- pinned: false
10
- license: openrail
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ # Vocal Remover
2
+
3
+ A web-based tool for removing vocals from audio files using deep learning.
4
+
5
+ ## Table of Contents
6
+
7
+ - [Overview](#overview)
8
+ - [Features](#features)
9
+ - [Installation](#installation)
10
+ - [Usage](#usage)
11
+ - [Demo](#demo)
12
+ - [Technologies Used](#technologies-used)
13
+ - [Contributing](#contributing)
14
+ - [License](#license)
15
+ - [Useful Research Papers](#useful-research-papers)
16
+
17
+ ## Overview
18
+
19
+ The Vocal Remover is a user-friendly web application that leverages deep learning models to remove vocals from audio files. It provides an easy, interactive way for users to upload their audio files and process them to obtain vocal-free versions.
20
+
21
+ ## Features
22
+
23
+ - Upload audio files in various formats (WAV, MP3, OGG, FLAC).
24
+ - Process audio files to remove vocals using a pre-trained deep learning model.
25
+ - Display a progress bar during audio processing.
26
+ - Play the original and processed audio files in the browser.
27
+ - Download the processed audio as a WAV file.
28
+ - Clean and intuitive user interface.
29
+
30
+ ## Installation
31
+
32
+ 1. Clone this repository:
33
+
34
+ ```bash
35
+ git clone https://github.com/smotto/Sing-For-Me.git
36
+ cd Sing-For-Me
+ ```
37
+
38
+ 2. Install the required Python packages:
39
+
40
+ ```bash
41
+ pip install -r requirements.txt
+ ```
42
+
43
+ ## Usage
44
+ 1. Run the Streamlit app:
45
+
46
+ ```bash
47
+ streamlit run app.py
+ ```
48
+
49
+ 2. Access the app in your web browser at http://localhost:8501.
50
+
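+ The separation can also be driven outside the web UI. The sketch below is a minimal, untested example that wires together the modules committed under `src/`; the `my_song` filename and the `_NullProgress` stand-in for the Streamlit progress bar are placeholders, while the model settings mirror the ones `app.py` uses.
+
+ ```python
+ import os
+
+ import soundfile as sf
+ import torch
+
+ from src.constants import INPUT_FOLDER, ONNX_MODEL_PATH, OUTPUT_FOLDER
+ from src.loader import Loader
+ from src.models.MDX_net.kimvocal import KimVocal
+ from src.models.MDX_net.mdx_net import Conv_TDF_net_trimm
+
+
+ class _NullProgress:
+     """Stand-in for st.progress() when running outside Streamlit."""
+
+     def progress(self, value):
+         pass
+
+
+ # Load a stereo WAV from ./datasets/input/my_song.wav at 44.1 kHz.
+ loader = Loader(INPUT_FOLDER, OUTPUT_FOLDER)
+ music_array, samplerate = loader.load_wav("my_song")
+
+ # Same hyperparameters that app.py passes to the ONNX wrapper.
+ model = Conv_TDF_net_trimm(
+     model_path=ONNX_MODEL_PATH, use_onnx=True, target_name="vocals",
+     L=11, l=3, g=48, bn=8, bias=False, dim_f=11, dim_t=8,
+ )
+
+ vocals = KimVocal().demix_vocals(
+     music_tensor=torch.tensor(music_array, dtype=torch.float32),
+     sample_rate=samplerate,
+     model=model,
+     streamlit_progressbar=_NullProgress(),  # placeholder progress sink
+ )
+
+ os.makedirs(OUTPUT_FOLDER, exist_ok=True)
+ sf.write(f"{OUTPUT_FOLDER}/vocals.wav", vocals.numpy().T, samplerate)
+ ```
+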
51
+ ## Demo
52
+ For a live demonstration, visit Demo Link.
53
+
54
+ ## Technologies Used
55
+ * Python
56
+ * Streamlit
57
+ * PyTorch
58
+ * Soundfile and Librosa
59
+
60
+ ## Contributing
61
+ Contributions are welcome! If you have suggestions, bug reports, or feature requests, please open an issue or submit a pull request.
62
+
63
+ ## License
64
+ This project is licensed under the Apache 2.0 License.
65
+
66
+ ## Useful Research Papers
67
+ - [U-Net: Convolutional Networks for Biomedical Image Segmentation](https://arxiv.org/abs/1505.04597)
68
+ - [WaveNet: A Generative Model for Raw Audio](https://arxiv.org/abs/1609.03499)
69
+ - [Wave-U-Net: A Multi-Scale Neural Network for End-to-End Audio Source Separation](https://arxiv.org/abs/1806.03185)
70
+ - [KUIELab-MDX-Net: A Two-Stream Neural Network for Music Demixing](https://arxiv.org/abs/2111.12203)
app.py ADDED
@@ -0,0 +1,81 @@
1
+ # Standard Library
2
+ import os
3
+
4
+ # Third-Party
5
+ import streamlit as st
6
+
7
+ # Local
8
+ from src.models.MDX_net.kimvocal import KimVocal
9
+ from src.loader import Loader
10
+ from src.models.MDX_net.mdx_net import Conv_TDF_net_trimm
11
+
12
+ # Constants
13
+ from src.constants import ONNX_MODEL_PATH
14
+
15
+ INPUT_FOLDER = "./datasets/input"
16
+ OUTPUT_FOLDER = "./datasets/output"
17
+
18
+
19
+ def main():
20
+ # Set page configuration and theming
21
+ st.set_page_config(
22
+ page_title="Jarod's Vocal Remover",
23
+ page_icon="🎵",
24
+ )
25
+ st.title("Vocal Remover")
26
+
27
+ # Upload WAV file
28
+ uploaded_file = st.file_uploader(
29
+ "Upload an Audio File (WAV, MP3, OGG, FLAC)",
30
+ type=["wav", "mp3", "ogg", "flac"],
31
+ key="file_uploader",
32
+ )
33
+
34
+ if uploaded_file is not None:
35
+ uploaded_file_ext = uploaded_file.name.lower().split(".")[-1]
36
+ # Process the uploaded audio
37
+ st.subheader("Audio Processing")
38
+ st.write("Processing the uploaded audio file...")
39
+
40
+ # Display a progress bar while processing
41
+ progress_bar = st.progress(0)
42
+ progress_text = st.empty()
43
+
44
+ loader = Loader(INPUT_FOLDER, OUTPUT_FOLDER)
45
+ music_tensor, samplerate = loader.prepare_uploaded_file(
46
+ uploaded_file=uploaded_file
47
+ )
48
+
49
+ model_raw_python = Conv_TDF_net_trimm(
50
+ model_path=ONNX_MODEL_PATH,
51
+ use_onnx=True,
52
+ target_name="vocals",
53
+ L=11,
54
+ l=3,
55
+ g=48,
56
+ bn=8,
57
+ bias=False,
58
+ dim_f=11,
59
+ dim_t=8,
60
+ )
61
+
62
+ kimvocal = KimVocal()
63
+ vocals_tensor = kimvocal.demix_vocals(
64
+ music_tensor=music_tensor,
65
+ sample_rate=samplerate,
66
+ model=model_raw_python,
67
+ streamlit_progressbar=progress_bar,
68
+ )
69
+ vocals_array = vocals_tensor.numpy()
70
+
71
+ # Update progress
72
+ progress_bar.progress(100)
73
+ progress_text.text("Audio processing complete!")
74
+
75
+ # Display processed audio
76
+ st.subheader("Processed Audio")
77
+ st.audio(vocals_array, format="audio/wav", sample_rate=samplerate)
78
+
79
+
80
+ if __name__ == "__main__":
81
+ main()
pretrained_models/MDX_net/Kim_Vocal.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce74ef3b6a6024ce44211a07be9cf8bc6d87728cc852a68ab34eb8e58cde9c8b
3
+ size 66759214
pretrained_models/MDX_net/model_data/model_data.json ADDED
@@ -0,0 +1,340 @@
1
+ {
2
+ "0ddfc0eb5792638ad5dc27850236c246": {
3
+ "compensate": 1.035,
4
+ "mdx_dim_f_set": 2048,
5
+ "mdx_dim_t_set": 8,
6
+ "mdx_n_fft_scale_set": 6144,
7
+ "primary_stem": "Vocals"
8
+ },
9
+ "26d308f91f3423a67dc69a6d12a8793d": {
10
+ "compensate": 1.035,
11
+ "mdx_dim_f_set": 2048,
12
+ "mdx_dim_t_set": 9,
13
+ "mdx_n_fft_scale_set": 8192,
14
+ "primary_stem": "Other"
15
+ },
16
+ "2cdd429caac38f0194b133884160f2c6": {
17
+ "compensate": 1.045,
18
+ "mdx_dim_f_set": 3072,
19
+ "mdx_dim_t_set": 8,
20
+ "mdx_n_fft_scale_set": 7680,
21
+ "primary_stem": "Instrumental"
22
+ },
23
+ "2f5501189a2f6db6349916fabe8c90de": {
24
+ "compensate": 1.035,
25
+ "mdx_dim_f_set": 2048,
26
+ "mdx_dim_t_set": 8,
27
+ "mdx_n_fft_scale_set": 6144,
28
+ "primary_stem": "Vocals"
29
+ },
30
+ "398580b6d5d973af3120df54cee6759d": {
31
+ "compensate": 1.75,
32
+ "mdx_dim_f_set": 3072,
33
+ "mdx_dim_t_set": 8,
34
+ "mdx_n_fft_scale_set": 7680,
35
+ "primary_stem": "Vocals"
36
+ },
37
+ "488b3e6f8bd3717d9d7c428476be2d75": {
38
+ "compensate": 1.035,
39
+ "mdx_dim_f_set": 3072,
40
+ "mdx_dim_t_set": 8,
41
+ "mdx_n_fft_scale_set": 7680,
42
+ "primary_stem": "Instrumental"
43
+ },
44
+ "4910e7827f335048bdac11fa967772f9": {
45
+ "compensate": 1.035,
46
+ "mdx_dim_f_set": 2048,
47
+ "mdx_dim_t_set": 7,
48
+ "mdx_n_fft_scale_set": 4096,
49
+ "primary_stem": "Drums"
50
+ },
51
+ "53c4baf4d12c3e6c3831bb8f5b532b93": {
52
+ "compensate": 1.043,
53
+ "mdx_dim_f_set": 3072,
54
+ "mdx_dim_t_set": 8,
55
+ "mdx_n_fft_scale_set": 7680,
56
+ "primary_stem": "Vocals"
57
+ },
58
+ "5d343409ef0df48c7d78cce9f0106781": {
59
+ "compensate": 1.075,
60
+ "mdx_dim_f_set": 3072,
61
+ "mdx_dim_t_set": 8,
62
+ "mdx_n_fft_scale_set": 7680,
63
+ "primary_stem": "Vocals"
64
+ },
65
+ "5f6483271e1efb9bfb59e4a3e6d4d098": {
66
+ "compensate": 1.035,
67
+ "mdx_dim_f_set": 2048,
68
+ "mdx_dim_t_set": 9,
69
+ "mdx_n_fft_scale_set": 6144,
70
+ "primary_stem": "Vocals"
71
+ },
72
+ "65ab5919372a128e4167f5e01a8fda85": {
73
+ "compensate": 1.035,
74
+ "mdx_dim_f_set": 2048,
75
+ "mdx_dim_t_set": 8,
76
+ "mdx_n_fft_scale_set": 8192,
77
+ "primary_stem": "Other"
78
+ },
79
+ "6703e39f36f18aa7855ee1047765621d": {
80
+ "compensate": 1.035,
81
+ "mdx_dim_f_set": 2048,
82
+ "mdx_dim_t_set": 9,
83
+ "mdx_n_fft_scale_set": 16384,
84
+ "primary_stem": "Bass"
85
+ },
86
+ "6b31de20e84392859a3d09d43f089515": {
87
+ "compensate": 1.035,
88
+ "mdx_dim_f_set": 2048,
89
+ "mdx_dim_t_set": 8,
90
+ "mdx_n_fft_scale_set": 6144,
91
+ "primary_stem": "Vocals"
92
+ },
93
+ "867595e9de46f6ab699008295df62798": {
94
+ "compensate": 1.03,
95
+ "mdx_dim_f_set": 3072,
96
+ "mdx_dim_t_set": 8,
97
+ "mdx_n_fft_scale_set": 7680,
98
+ "primary_stem": "Vocals"
99
+ },
100
+ "a3cd63058945e777505c01d2507daf37": {
101
+ "compensate": 1.03,
102
+ "mdx_dim_f_set": 2048,
103
+ "mdx_dim_t_set": 8,
104
+ "mdx_n_fft_scale_set": 6144,
105
+ "primary_stem": "Vocals"
106
+ },
107
+ "b33d9b3950b6cbf5fe90a32608924700": {
108
+ "compensate": 1.03,
109
+ "mdx_dim_f_set": 3072,
110
+ "mdx_dim_t_set": 8,
111
+ "mdx_n_fft_scale_set": 7680,
112
+ "primary_stem": "Vocals"
113
+ },
114
+ "c3b29bdce8c4fa17ec609e16220330ab": {
115
+ "compensate": 1.035,
116
+ "mdx_dim_f_set": 2048,
117
+ "mdx_dim_t_set": 8,
118
+ "mdx_n_fft_scale_set": 16384,
119
+ "primary_stem": "Bass"
120
+ },
121
+ "ceed671467c1f64ebdfac8a2490d0d52": {
122
+ "compensate": 1.035,
123
+ "mdx_dim_f_set": 3072,
124
+ "mdx_dim_t_set": 8,
125
+ "mdx_n_fft_scale_set": 7680,
126
+ "primary_stem": "Instrumental"
127
+ },
128
+ "d2a1376f310e4f7fa37fb9b5774eb701": {
129
+ "compensate": 1.035,
130
+ "mdx_dim_f_set": 3072,
131
+ "mdx_dim_t_set": 8,
132
+ "mdx_n_fft_scale_set": 7680,
133
+ "primary_stem": "Instrumental"
134
+ },
135
+ "d7bff498db9324db933d913388cba6be": {
136
+ "compensate": 1.035,
137
+ "mdx_dim_f_set": 2048,
138
+ "mdx_dim_t_set": 8,
139
+ "mdx_n_fft_scale_set": 6144,
140
+ "primary_stem": "Vocals"
141
+ },
142
+ "d94058f8c7f1fae4164868ae8ae66b20": {
143
+ "compensate": 1.035,
144
+ "mdx_dim_f_set": 2048,
145
+ "mdx_dim_t_set": 8,
146
+ "mdx_n_fft_scale_set": 6144,
147
+ "primary_stem": "Vocals"
148
+ },
149
+ "dc41ede5961d50f277eb846db17f5319": {
150
+ "compensate": 1.035,
151
+ "mdx_dim_f_set": 2048,
152
+ "mdx_dim_t_set": 9,
153
+ "mdx_n_fft_scale_set": 4096,
154
+ "primary_stem": "Drums"
155
+ },
156
+ "e5572e58abf111f80d8241d2e44e7fa4": {
157
+ "compensate": 1.028,
158
+ "mdx_dim_f_set": 3072,
159
+ "mdx_dim_t_set": 8,
160
+ "mdx_n_fft_scale_set": 7680,
161
+ "primary_stem": "Instrumental"
162
+ },
163
+ "e7324c873b1f615c35c1967f912db92a": {
164
+ "compensate": 1.03,
165
+ "mdx_dim_f_set": 3072,
166
+ "mdx_dim_t_set": 8,
167
+ "mdx_n_fft_scale_set": 7680,
168
+ "primary_stem": "Vocals"
169
+ },
170
+ "1c56ec0224f1d559c42fd6fd2a67b154": {
171
+ "compensate": 1.025,
172
+ "mdx_dim_f_set": 2048,
173
+ "mdx_dim_t_set": 8,
174
+ "mdx_n_fft_scale_set": 5120,
175
+ "primary_stem": "Instrumental"
176
+ },
177
+ "f2df6d6863d8f435436d8b561594ff49": {
178
+ "compensate": 1.035,
179
+ "mdx_dim_f_set": 3072,
180
+ "mdx_dim_t_set": 8,
181
+ "mdx_n_fft_scale_set": 7680,
182
+ "primary_stem": "Instrumental"
183
+ },
184
+ "b06327a00d5e5fbc7d96e1781bbdb596": {
185
+ "compensate": 1.035,
186
+ "mdx_dim_f_set": 3072,
187
+ "mdx_dim_t_set": 8,
188
+ "mdx_n_fft_scale_set": 6144,
189
+ "primary_stem": "Instrumental"
190
+ },
191
+ "94ff780b977d3ca07c7a343dab2e25dd": {
192
+ "compensate": 1.039,
193
+ "mdx_dim_f_set": 3072,
194
+ "mdx_dim_t_set": 8,
195
+ "mdx_n_fft_scale_set": 6144,
196
+ "primary_stem": "Instrumental"
197
+ },
198
+ "73492b58195c3b52d34590d5474452f6": {
199
+ "compensate": 1.043,
200
+ "mdx_dim_f_set": 3072,
201
+ "mdx_dim_t_set": 8,
202
+ "mdx_n_fft_scale_set": 7680,
203
+ "primary_stem": "Vocals"
204
+ },
205
+ "970b3f9492014d18fefeedfe4773cb42": {
206
+ "compensate": 1.009,
207
+ "mdx_dim_f_set": 3072,
208
+ "mdx_dim_t_set": 8,
209
+ "mdx_n_fft_scale_set": 7680,
210
+ "primary_stem": "Vocals"
211
+ },
212
+ "1d64a6d2c30f709b8c9b4ce1366d96ee": {
213
+ "compensate": 1.065,
214
+ "mdx_dim_f_set": 2048,
215
+ "mdx_dim_t_set": 8,
216
+ "mdx_n_fft_scale_set": 5120,
217
+ "primary_stem": "Instrumental"
218
+ },
219
+ "203f2a3955221b64df85a41af87cf8f0": {
220
+ "compensate": 1.035,
221
+ "mdx_dim_f_set": 3072,
222
+ "mdx_dim_t_set": 8,
223
+ "mdx_n_fft_scale_set": 6144,
224
+ "primary_stem": "Instrumental"
225
+ },
226
+ "291c2049608edb52648b96e27eb80e95": {
227
+ "compensate": 1.035,
228
+ "mdx_dim_f_set": 3072,
229
+ "mdx_dim_t_set": 8,
230
+ "mdx_n_fft_scale_set": 6144,
231
+ "primary_stem": "Instrumental"
232
+ },
233
+ "ead8d05dab12ec571d67549b3aab03fc": {
234
+ "compensate": 1.035,
235
+ "mdx_dim_f_set": 3072,
236
+ "mdx_dim_t_set": 8,
237
+ "mdx_n_fft_scale_set": 6144,
238
+ "primary_stem": "Instrumental"
239
+ },
240
+ "cc63408db3d80b4d85b0287d1d7c9632": {
241
+ "compensate": 1.033,
242
+ "mdx_dim_f_set": 3072,
243
+ "mdx_dim_t_set": 8,
244
+ "mdx_n_fft_scale_set": 6144,
245
+ "primary_stem": "Instrumental"
246
+ },
247
+ "cd5b2989ad863f116c855db1dfe24e39": {
248
+ "compensate": 1.035,
249
+ "mdx_dim_f_set": 3072,
250
+ "mdx_dim_t_set": 9,
251
+ "mdx_n_fft_scale_set": 6144,
252
+ "primary_stem": "Other"
253
+ },
254
+ "55657dd70583b0fedfba5f67df11d711": {
255
+ "compensate": 1.022,
256
+ "mdx_dim_f_set": 3072,
257
+ "mdx_dim_t_set": 8,
258
+ "mdx_n_fft_scale_set": 6144,
259
+ "primary_stem": "Instrumental"
260
+ },
261
+ "b6bccda408a436db8500083ef3491e8b": {
262
+ "compensate": 1.02,
263
+ "mdx_dim_f_set": 3072,
264
+ "mdx_dim_t_set": 8,
265
+ "mdx_n_fft_scale_set": 7680,
266
+ "primary_stem": "Instrumental"
267
+ },
268
+ "8a88db95c7fb5dbe6a095ff2ffb428b1": {
269
+ "compensate": 1.026,
270
+ "mdx_dim_f_set": 2048,
271
+ "mdx_dim_t_set": 8,
272
+ "mdx_n_fft_scale_set": 5120,
273
+ "primary_stem": "Instrumental"
274
+ },
275
+ "b78da4afc6512f98e4756f5977f5c6b9": {
276
+ "compensate": 1.021,
277
+ "mdx_dim_f_set": 3072,
278
+ "mdx_dim_t_set": 8,
279
+ "mdx_n_fft_scale_set": 7680,
280
+ "primary_stem": "Instrumental"
281
+ },
282
+ "77d07b2667ddf05b9e3175941b4454a0": {
283
+ "compensate": 1.021,
284
+ "mdx_dim_f_set": 3072,
285
+ "mdx_dim_t_set": 8,
286
+ "mdx_n_fft_scale_set": 7680,
287
+ "primary_stem": "Vocals"
288
+ },
289
+ "2154254ee89b2945b97a7efed6e88820": {
290
+ "config_yaml": "model_2_stem_061321.yaml"
291
+ },
292
+ "063aadd735d58150722926dcbf5852a9": {
293
+ "config_yaml": "model_2_stem_061321.yaml"
294
+ },
295
+ "fe96801369f6a148df2720f5ced88c19": {
296
+ "config_yaml": "model3.yaml"
297
+ },
298
+ "02e8b226f85fb566e5db894b9931c640": {
299
+ "config_yaml": "model2.yaml"
300
+ },
301
+ "e3de6d861635ab9c1d766149edd680d6": {
302
+ "config_yaml": "model1.yaml"
303
+ },
304
+ "3f2936c554ab73ce2e396d54636bd373": {
305
+ "config_yaml": "modelB.yaml"
306
+ },
307
+ "890d0f6f82d7574bca741a9e8bcb8168": {
308
+ "config_yaml": "modelB.yaml"
309
+ },
310
+ "63a3cb8c37c474681049be4ad1ba8815": {
311
+ "config_yaml": "modelB.yaml"
312
+ },
313
+ "a7fc5d719743c7fd6b61bd2b4d48b9f0": {
314
+ "config_yaml": "modelA.yaml"
315
+ },
316
+ "3567f3dee6e77bf366fcb1c7b8bc3745": {
317
+ "config_yaml": "modelA.yaml"
318
+ },
319
+ "a28f4d717bd0d34cd2ff7a3b0a3d065e": {
320
+ "config_yaml": "modelA.yaml"
321
+ },
322
+ "c9971a18da20911822593dc81caa8be9": {
323
+ "config_yaml": "sndfx.yaml"
324
+ },
325
+ "57d94d5ed705460d21c75a5ac829a605": {
326
+ "config_yaml": "sndfx.yaml"
327
+ },
328
+ "e7a25f8764f25a52c1b96c4946e66ba2": {
329
+ "config_yaml": "sndfx.yaml"
330
+ },
331
+ "104081d24e37217086ce5fde09147ee1": {
332
+ "config_yaml": "model_2_stem_061321.yaml"
333
+ },
334
+ "1e6165b601539f38d0a9330f3facffeb": {
335
+ "config_yaml": "model_2_stem_061321.yaml"
336
+ },
337
+ "fe0108464ce0d8271be5ab810891bd7c": {
338
+ "config_yaml": "model_2_stem_full_band.yaml"
339
+ }
340
+ }
pretrained_models/MDX_net/model_data/model_name_mapper.json ADDED
@@ -0,0 +1,22 @@
1
+ {
2
+ "UVR_MDXNET_1_9703": "UVR-MDX-NET 1",
3
+ "UVR_MDXNET_2_9682": "UVR-MDX-NET 2",
4
+ "UVR_MDXNET_3_9662": "UVR-MDX-NET 3",
5
+ "UVR_MDXNET_KARA": "UVR-MDX-NET Karaoke",
6
+ "UVR_MDXNET_Main": "UVR-MDX-NET Main",
7
+ "UVR-MDX-NET-Inst_1": "UVR-MDX-NET Inst 1",
8
+ "UVR-MDX-NET-Inst_2": "UVR-MDX-NET Inst 2",
9
+ "UVR-MDX-NET-Inst_3": "UVR-MDX-NET Inst 3",
10
+ "UVR-MDX-NET-Inst_4": "UVR-MDX-NET Inst 4",
11
+ "UVR-MDX-NET-Inst_Main": "UVR-MDX-NET Inst Main",
12
+ "UVR-MDX-NET-Inst_Main_2": "UVR-MDX-NET Inst Main 2",
13
+ "UVR-MDX-NET-Inst_HQ_1": "UVR-MDX-NET Inst HQ 1",
14
+ "UVR-MDX-NET-Inst_HQ_2": "UVR-MDX-NET Inst HQ 2",
15
+ "UVR-MDX-NET-Inst_HQ_3": "UVR-MDX-NET Inst HQ 3",
16
+ "UVR_MDXNET_KARA_2": "UVR-MDX-NET Karaoke 2",
17
+ "UVR-MDX-NET-Voc_FT": "UVR-MDX-NET Voc FT",
18
+ "Kim_Vocal_1": "Kim Vocal 1",
19
+ "Kim_Vocal_2": "Kim Vocal 2",
20
+ "Kim_Inst": "Kim Inst",
21
+ "Reverb_HQ_By_FoxJoy": "Reverb HQ"
22
+ }
requirements.txt ADDED
@@ -0,0 +1,8 @@
1
+ pip~=23.2.1
2
+ torch~=2.0.1
3
+ onnxruntime~=1.15.1
4
+ librosa~=0.10.0.post2
5
+ soundfile~=0.12.1
6
+ numpy~=1.24.4
7
+ scipy~=1.11.1
8
+ streamlit~=1.25.0
src/Sound_Feature_Extraction/short_time_fourier_transform.py ADDED
@@ -0,0 +1,50 @@
1
+ import torch
2
+
3
+
4
+ class STFT:
5
+ def __init__(self, n_fft, hop_length, dim_f):
6
+ self.n_fft = n_fft
7
+ self.hop_length = hop_length
8
+ self.window = torch.hann_window(window_length=n_fft, periodic=True)
9
+ self.dim_f = dim_f
10
+
11
+ def __call__(self, x):
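+ # Input: (..., c, t) waveform; output: (..., c * 2, dim_f, n_frames), with the real and
+ # imaginary parts of each channel's spectrogram stacked along the channel axis.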
12
+ window = self.window.to(x.device)
13
+ batch_dims = x.shape[:-2]
14
+ c, t = x.shape[-2:]
15
+ x = x.reshape([-1, t])
16
+ x = torch.stft(
17
+ x,
18
+ n_fft=self.n_fft,
19
+ hop_length=self.hop_length,
20
+ window=window,
21
+ center=True,
22
+ return_complex=True,
23
+ )
24
+ x = torch.view_as_real(x)
25
+ x = x.permute([0, 3, 1, 2])
26
+ x = x.reshape([*batch_dims, c, 2, -1, x.shape[-1]]).reshape(
27
+ [*batch_dims, c * 2, -1, x.shape[-1]]
28
+ )
29
+ return x[..., : self.dim_f, :]
30
+
31
+ def inverse(self, x):
32
+ window = self.window.to(x.device)
33
+ batch_dims = x.shape[:-3]
34
+ c, f, t = x.shape[-3:]
35
+ n = self.n_fft // 2 + 1
36
+ f_pad = torch.zeros([*batch_dims, c, n - f, t]).to(x.device)
37
+ x = torch.cat([x, f_pad], -2)
38
+ x = x.reshape([*batch_dims, c // 2, 2, n, t]).reshape([-1, 2, n, t])
39
+ x = x.permute([0, 2, 3, 1])
40
+ x = x.contiguous()
41
+ t_complex = torch.view_as_complex(x)
42
+ x = torch.istft(
43
+ t_complex,
44
+ n_fft=self.n_fft,
45
+ hop_length=self.hop_length,
46
+ window=window,
47
+ center=True,
48
+ )
49
+ x = x.reshape([*batch_dims, 2, -1])
50
+ return x
src/constants.py ADDED
@@ -0,0 +1,9 @@
1
+ # Third-party
2
+ import torch
3
+
4
+ # Global Variables
5
+ COMPUTATION_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
6
+ EXECUTION_PROVIDER_LIST = ["CUDAExecutionProvider", "CPUExecutionProvider"]
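+ # onnxruntime tries these providers in order, so machines without CUDA fall back to the CPU provider.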
7
+ ONNX_MODEL_PATH = "./pretrained_models/MDX_net/Kim_Vocal.onnx"  # model committed in this repo
8
+ INPUT_FOLDER = "./datasets/input"
9
+ OUTPUT_FOLDER = "./datasets/output"
src/infer.py ADDED
@@ -0,0 +1,20 @@
1
+ # Standard Library Imports
2
+ import os
3
+ import subprocess
4
+
5
+ # Third Party Imports
6
+ import torch
7
+ import onnxruntime as ort
8
+
9
+ # Local Imports
10
+ from models.MDX_net.mdx_net import Conv_TDF_net_trimm
11
+ from loader import Loader
12
+
13
+ vocal_path = r"./datasets/output/vocals.wav"
14
+
15
+ # Global Variables
16
+ COMPUTATION_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
17
+
18
+
19
+ def main():
20
+ print(COMPUTATION_DEVICE)
+
+
+ if __name__ == "__main__":
+ main()
src/loader.py ADDED
@@ -0,0 +1,30 @@
1
+ # Standard Library
2
+ import os
3
+
4
+ # Explicit Typing
5
+ from typing import Tuple
6
+ from numpy import ndarray
7
+
8
+ # Third-party
9
+ import librosa
10
+ import torch
11
+
12
+
13
+ class Loader:
14
+ """Loading sound files into a usable format for pytorch"""
15
+
16
+ def __init__(self, INPUT_FOLDER, OUTPUT_FOLDER):
17
+ self.input = INPUT_FOLDER
18
+ self.output = OUTPUT_FOLDER
+
19
+ def load_wav(self, name) -> Tuple[ndarray, int]:
20
+ music_array, samplerate = librosa.load(
21
+ os.path.join(self.input, name + ".wav"), mono=False, sr=44100
22
+ )
23
+ return music_array, samplerate
24
+
25
+ def prepare_uploaded_file(self, uploaded_file) -> Tuple[torch.Tensor, int]:
26
+ music_array, samplerate = librosa.load(uploaded_file, mono=False, sr=44100)
27
+
28
+ music_tensor = torch.tensor(music_array, dtype=torch.float32)
29
+
30
+ return music_tensor, samplerate
src/models/MDX_net/kimvocal.py ADDED
@@ -0,0 +1,73 @@
1
+ # Standard Library Imports
2
+
3
+ # Third Party Imports
4
+ import torch
5
+ import onnxruntime as ort
6
+
7
+ # Local Imports
8
+ from src.models.MDX_net.mdx_net import Conv_TDF_net_trimm
9
+ from src.loader import Loader
10
+
11
+ # Global Variables
12
+ from src.constants import EXECUTION_PROVIDER_LIST, COMPUTATION_DEVICE, ONNX_MODEL_PATH
13
+
14
+
15
+ class KimVocal:
16
+ """
17
+ TODO: Put something here for flexibility purposes (model types).
18
+ """
19
+
20
+ def __init__(self):
21
+ pass
22
+
23
+ def demix_vocals(self, music_tensor, sample_rate, model, streamlit_progressbar):
24
+ """
25
+ Remove vocals from a mixture using an ONNX model.
+
+ Args:
+ music_tensor (torch.Tensor): Stereo input tensor of shape (2, n_samples).
+ sample_rate (int): Sample rate of the input audio (currently unused).
+ model (torch.nn.Module): Network wrapper providing the STFT, chunk_size, and overlap.
+ streamlit_progressbar (st.progress): Progress bar updated while chunks are processed.
30
+
31
+ Returns:
32
+ torch.Tensor: Output tensor after passing through the network.
33
+ """
34
+ number_of_samples = music_tensor.shape[1]
35
+ overlap = model.overlap
36
+ # Calculate chunk_size and gen_size based on the sample rate
37
+ chunk_size = model.chunk_size
38
+ gen_size = chunk_size - 2 * overlap
39
+ pad_size = gen_size - number_of_samples % gen_size
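+ # With the defaults committed in mdx_net.py (hop=1024, dim_t hard-coded to 256,
+ # overlap=1500), chunk_size = 1024 * 255 = 261120 samples and
+ # gen_size = 261120 - 2 * 1500 = 258120, i.e. each ONNX call sees roughly 5.9 s of
+ # 44.1 kHz audio and the outer ~34 ms on each side is discarded as overlap.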
40
+ mix_padded = torch.cat(
41
+ [torch.zeros(2, overlap), music_tensor, torch.zeros(2, pad_size + overlap)],
42
+ 1,
43
+ )
44
+
45
+ # Start running the session for the model
46
+ ort_session = ort.InferenceSession(ONNX_MODEL_PATH, providers=EXECUTION_PROVIDER_LIST)
47
+
48
+ # TODO: any way to optimize against silence? I think that's what skips are for, gotta double check.
49
+ # process one chunk at a time (batch_size=1)
50
+ demixed_chunks = []
51
+ i = 0
52
+ while i < number_of_samples + pad_size:
53
+ # Progress Bar
54
+ streamlit_progressbar.progress(i / (number_of_samples + pad_size))
55
+
56
+ # Computation
57
+ chunk = mix_padded[:, i : i + chunk_size]
58
+ x = model.stft(chunk.unsqueeze(0).to(COMPUTATION_DEVICE))
59
+ with torch.no_grad():
60
+ x = torch.tensor(ort_session.run(None, {"input": x.cpu().numpy()})[0])
61
+ x = model.stft.inverse(x).squeeze(0)
62
+ x = x[..., overlap:-overlap]
63
+ demixed_chunks.append(x)
64
+ i += gen_size
65
+
66
+ vocals_output = torch.cat(demixed_chunks, -1)[..., :-pad_size].cpu()
67
+
68
+ return vocals_output
69
+
70
+
71
+ if __name__ == "__main__":
+ # KimVocal has no standalone entry point; demix_vocals() is driven by app.py,
+ # which supplies the audio tensor, the model, and a Streamlit progress bar.
+ pass
src/models/MDX_net/mdx_net.py ADDED
@@ -0,0 +1,275 @@
1
+ # Third-party
2
+ import torch
3
+ import torch.nn as nn
4
+
5
+ # Local
6
+ from src.Sound_Feature_Extraction.short_time_fourier_transform import STFT
7
+
8
+ COMPUTATION_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
9
+
10
+
11
+ class Conv_TDF(nn.Module):
12
+ """
13
+ Convolutional Time-Domain Filter (TDF) Module.
14
+
15
+ Args:
16
+ c (int): The number of input and output channels for the convolutional layers.
17
+ l (int): The number of convolutional layers within the module.
18
+ f (int): The number of features (or units) in the time-domain filter.
19
+ k (int): The size of the convolutional kernels (filters).
20
+ bn (int or None): Batch normalization factor (controls TDF behavior). If None, TDF is not used.
21
+ bias (bool): A boolean flag indicating whether bias terms are included in the linear layers.
22
+
23
+ Attributes:
24
+ use_tdf (bool): Flag indicating whether TDF is used.
25
+
26
+ Methods:
27
+ forward(x): Forward pass through the TDF module.
28
+ """
29
+
30
+ def __init__(self, c, l, f, k, bn, bias=True):
31
+ super(Conv_TDF, self).__init__()
32
+
33
+ # Determine whether to use TDF (Time-Domain Filter)
34
+ self.use_tdf = bn is not None
35
+
36
+ # Define a list of convolutional layers within the module
37
+ self.H = nn.ModuleList()
38
+ for i in range(l):
39
+ self.H.append(
40
+ nn.Sequential(
41
+ nn.Conv2d(
42
+ in_channels=c,
43
+ out_channels=c,
44
+ kernel_size=k,
45
+ stride=1,
46
+ padding=k // 2,
47
+ ),
48
+ nn.GroupNorm(2, c),
49
+ nn.ReLU(),
50
+ )
51
+ )
52
+
53
+ # Define the Time-Domain Filter (TDF) layers if enabled
54
+ if self.use_tdf:
55
+ if bn == 0:
56
+ self.tdf = nn.Sequential(
57
+ nn.Linear(f, f, bias=bias), nn.GroupNorm(2, c), nn.ReLU()
58
+ )
59
+ else:
60
+ self.tdf = nn.Sequential(
61
+ nn.Linear(f, f // bn, bias=bias),
62
+ nn.GroupNorm(2, c),
63
+ nn.ReLU(),
64
+ nn.Linear(f // bn, f, bias=bias),
65
+ nn.GroupNorm(2, c),
66
+ nn.ReLU(),
67
+ )
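+ # Example: with the settings app.py passes (bn=8) and the dim_f of 3072 hard-coded in
+ # Conv_TDF_net_trimm, the first TDF bottleneck maps 3072 -> 384 -> 3072 features per channel.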
68
+
69
+ def forward(self, x):
70
+ # Apply the convolutional layers sequentially
71
+ for h in self.H:
72
+ x = h(x)
73
+
74
+ # Apply the Time-Domain Filter (TDF) if enabled, and add the result to the original input
75
+ return x + self.tdf(x) if self.use_tdf else x
76
+
77
+
78
+ class Conv_TDF_net_trimm(nn.Module):
79
+ """
80
+ Convolutional Time-Domain Filter (TDF) Network with Trimming.
81
+
82
+ Args:
83
+ L (int): This parameter controls the number of down-sampling (DS) blocks in the network.
84
+ It's divided by 2 to determine how many DS blocks should be created.
85
+ l (int): This parameter represents the number of convolutional layers (or filters) within each dense (fully connected) block.
86
+ g (int): This parameter specifies the number of output channels for the first convolutional layer and is also used to determine the number of channels for subsequent layers in the network.
87
+ dim_f (int): This parameter represents the number of frequency bins (spectrogram columns) in the input audio data.
88
+ dim_t (int): This parameter represents the number of time frames (spectrogram rows) in the input audio data.
89
+ k (int): This parameter specifies the size of convolutional kernels (filters) used in the network's convolutional layers.
90
+ bn (int or None): This parameter controls whether batch normalization is used in the network.
91
+ If it's None, batch normalization may or may not be used based on other conditions in the code.
92
+ bias (bool): This parameter is a boolean flag that controls whether bias terms are included in the convolutional layers.
93
+ overlap (int): This parameter specifies the amount of overlap between consecutive chunks of audio data during processing.
94
+
95
+ Attributes:
96
+ n (int): The calculated number of down-sampling (DS) blocks.
97
+ dim_f (int): The number of frequency bins (spectrogram columns) in the input audio data.
98
+ dim_t (int): The number of time frames (spectrogram rows) in the input audio data.
99
+ n_fft (int): The size of the Fast Fourier Transform (FFT) window.
100
+ hop (int): The hop size used in the STFT calculations.
101
+ n_bins (int): The number of bins in the frequency domain.
102
+ chunk_size (int): The size of each chunk of audio data.
103
+ target_name (str): The name of the target instrument being separated.
104
+ overlap (int): The amount of overlap between consecutive chunks of audio data during processing.
105
+
106
+ Methods:
107
+ forward(x): Forward pass through the Conv_TDF_net_trimm network.
108
+ """
109
+
110
+ def __init__(
111
+ self,
112
+ model_path,
113
+ use_onnx,
114
+ target_name,
115
+ L,
116
+ l,
117
+ g,
118
+ dim_f,
119
+ dim_t,
120
+ k=3,
121
+ hop=1024,
122
+ bn=None,
123
+ bias=True,
124
+ overlap=1500,
+ mid_tdf=False,
125
+ ):
126
+ super(Conv_TDF_net_trimm, self).__init__()
127
+ # Dictionary specifying the scale for the number of FFT bins for different target names
128
+ n_fft_scale = {"vocals": 3, "*": 2}
129
+
130
+ # Number of input and output channels for the initial and final convolutional layers
131
+ out_c = in_c = 4
132
+
133
+ # Number of down-sampling (DS) blocks
134
+ self.n = L // 2
135
+
136
+ # Dimensions of the frequency and time axes of the input data
137
+ self.dim_f = 3072
138
+ self.dim_t = 256
139
+
140
+ # Number of FFT bins (frequencies) and hop size for the Short-Time Fourier Transform (STFT)
141
+ self.n_fft = 7680
142
+ self.hop = hop
143
+ self.n_bins = self.n_fft // 2 + 1
144
+
145
+ # Chunk size used for processing
146
+ self.chunk_size = hop * (self.dim_t - 1)
147
+
148
+ # Target name for the model
149
+ self.target_name = target_name
150
+
151
+ # Overlap between consecutive chunks of audio data during processing
152
+ self.overlap = overlap
153
+
154
+ # STFT module for audio processing
155
+ self.stft = STFT(self.n_fft, self.hop, self.dim_f)
156
+
157
+ # Check if ONNX representation of the model should be used
158
+ if not use_onnx:
159
+ # First convolutional layer
160
+ self.first_conv = nn.Sequential(
161
+ nn.Conv2d(in_channels=in_c, out_channels=g, kernel_size=1, stride=1),
162
+ nn.BatchNorm2d(g),
163
+ nn.ReLU(),
164
+ )
165
+
166
+ # Initialize variables for dense (fully connected) blocks and downsampling (DS) blocks
167
+ f = self.dim_f
168
+ c = g
169
+ self.ds_dense = nn.ModuleList()
170
+ self.ds = nn.ModuleList()
171
+
172
+ # Loop through down-sampling (DS) blocks
173
+ for i in range(self.n):
174
+ # Create dense (fully connected) block for down-sampling
175
+ self.ds_dense.append(Conv_TDF(c, l, f, k, bn, bias=bias))
176
+
177
+ # Create down-sampling (DS) block
178
+ scale = (2, 2)
179
+ self.ds.append(
180
+ nn.Sequential(
181
+ nn.Conv2d(
182
+ in_channels=c,
183
+ out_channels=c + g,
184
+ kernel_size=scale,
185
+ stride=scale,
186
+ ),
187
+ nn.BatchNorm2d(c + g),
188
+ nn.ReLU(),
189
+ )
190
+ )
191
+ f = f // 2
192
+ c += g
193
+
194
+ # Middle dense (fully connected block)
195
+ self.mid_dense = Conv_TDF(c, l, f, k, bn, bias=bias)
196
+
197
+ # If batch normalization is not specified and the mid_tdf flag (default False) is set, use Conv_TDF with bn=0 and bias=False
198
+ if bn is None and mid_tdf:
199
+ self.mid_dense = Conv_TDF(c, l, f, k, bn=0, bias=False)
200
+
201
+ # Initialize variables for up-sampling (US) blocks
202
+ self.us_dense = nn.ModuleList()
203
+ self.us = nn.ModuleList()
204
+
205
+ # Loop through up-sampling (US) blocks
206
+ for i in range(self.n):
207
+ scale = (2, 2)
208
+ # Create up-sampling (US) block
209
+ self.us.append(
210
+ nn.Sequential(
211
+ nn.ConvTranspose2d(
212
+ in_channels=c,
213
+ out_channels=c - g,
214
+ kernel_size=scale,
215
+ stride=scale,
216
+ ),
217
+ nn.BatchNorm2d(c - g),
218
+ nn.ReLU(),
219
+ )
220
+ )
221
+ f = f * 2
222
+ c -= g
223
+
224
+ # Create dense (fully connected) block for up-sampling
225
+ self.us_dense.append(Conv_TDF(c, l, f, k, bn, bias=bias))
226
+
227
+ # Final convolutional layer
228
+ self.final_conv = nn.Sequential(
229
+ nn.Conv2d(in_channels=c, out_channels=out_c, kernel_size=1, stride=1),
230
+ )
231
+
232
+ try:
233
+ # Load model state from a file
234
+ self.load_state_dict(
235
+ torch.load(
236
+ f"{model_path}/{target_name}.pt",
237
+ map_location=COMPUTATION_DEVICE,
238
+ )
239
+ )
240
+ print(f"Loading model ({target_name})")
241
+ except FileNotFoundError:
242
+ print(f"Random init ({target_name})")
243
+
244
+ def forward(self, x):
245
+ """
246
+ Forward pass through the Conv_TDF_net_trimm network.
247
+
248
+ Args:
249
+ x (torch.Tensor): Input tensor.
250
+
251
+ Returns:
252
+ torch.Tensor: Output tensor after passing through the network.
253
+ """
254
+ x = self.first_conv(x)
255
+
256
+ x = x.transpose(-1, -2)
257
+
258
+ ds_outputs = []
259
+ for i in range(self.n):
260
+ x = self.ds_dense[i](x)
261
+ ds_outputs.append(x)
262
+ x = self.ds[i](x)
263
+
264
+ x = self.mid_dense(x)
265
+
266
+ for i in range(self.n):
267
+ x = self.us[i](x)
268
+ x *= ds_outputs[-i - 1]
269
+ x = self.us_dense[i](x)
270
+
271
+ x = x.transpose(-1, -2)
272
+
273
+ x = self.final_conv(x)
274
+
275
+ return x
src/models/Pitch_Feature_Extraction/rmvpe.py ADDED
1
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ # Defaults matching the configuration used below (128 mel channels, 360 pitch classes);
+ # only the non-GRU branch of E2E refers to them.
+ N_MELS = 128
+ N_CLASS = 360
5
+
6
+
7
+ class BiGRU(nn.Module):
8
+ def __init__(self, input_features, hidden_features, num_layers):
9
+ super(BiGRU, self).__init__()
10
+ self.gru = nn.GRU(
11
+ input_features,
12
+ hidden_features,
13
+ num_layers=num_layers,
14
+ batch_first=True,
15
+ bidirectional=True,
16
+ )
17
+
18
+ def forward(self, x):
19
+ return self.gru(x)[0]
20
+
21
+
22
+ class ConvBlockRes(nn.Module):
23
+ def __init__(self, in_channels, out_channels, momentum=0.01):
24
+ super(ConvBlockRes, self).__init__()
25
+ self.conv = nn.Sequential(
26
+ nn.Conv2d(
27
+ in_channels=in_channels,
28
+ out_channels=out_channels,
29
+ kernel_size=(3, 3),
30
+ stride=(1, 1),
31
+ padding=(1, 1),
32
+ bias=False,
33
+ ),
34
+ nn.BatchNorm2d(out_channels, momentum=momentum),
35
+ nn.ReLU(),
36
+ nn.Conv2d(
37
+ in_channels=out_channels,
38
+ out_channels=out_channels,
39
+ kernel_size=(3, 3),
40
+ stride=(1, 1),
41
+ padding=(1, 1),
42
+ bias=False,
43
+ ),
44
+ nn.BatchNorm2d(out_channels, momentum=momentum),
45
+ nn.ReLU(),
46
+ )
47
+ if in_channels != out_channels:
48
+ self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))
49
+ self.is_shortcut = True
50
+ else:
51
+ self.is_shortcut = False
52
+
53
+ def forward(self, x):
54
+ if self.is_shortcut:
55
+ return self.conv(x) + self.shortcut(x)
56
+ else:
57
+ return self.conv(x) + x
58
+
59
+
60
+ class Encoder(nn.Module):
61
+ def __init__(
62
+ self,
63
+ in_channels,
64
+ in_size,
65
+ n_encoders,
66
+ kernel_size,
67
+ n_blocks,
68
+ out_channels=16,
69
+ momentum=0.01,
70
+ ):
71
+ super(Encoder, self).__init__()
72
+ self.n_encoders = n_encoders
73
+ self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
74
+ self.layers = nn.ModuleList()
75
+ self.latent_channels = []
76
+ for i in range(self.n_encoders):
77
+ self.layers.append(
78
+ ResEncoderBlock(
79
+ in_channels, out_channels, kernel_size, n_blocks, momentum=momentum
80
+ )
81
+ )
82
+ self.latent_channels.append([out_channels, in_size])
83
+ in_channels = out_channels
84
+ out_channels *= 2
85
+ in_size //= 2
86
+ self.out_size = in_size
87
+ self.out_channel = out_channels
88
+
89
+ def forward(self, x):
90
+ concat_tensors = []
91
+ x = self.bn(x)
92
+ for i in range(self.n_encoders):
93
+ _, x = self.layers[i](x)
94
+ concat_tensors.append(_)
95
+ return x, concat_tensors
96
+
97
+
98
+ class ResEncoderBlock(nn.Module):
99
+ def __init__(
100
+ self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01
101
+ ):
102
+ super(ResEncoderBlock, self).__init__()
103
+ self.n_blocks = n_blocks
104
+ self.conv = nn.ModuleList()
105
+ self.conv.append(ConvBlockRes(in_channels, out_channels, momentum))
106
+ for i in range(n_blocks - 1):
107
+ self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
108
+ self.kernel_size = kernel_size
109
+ if self.kernel_size is not None:
110
+ self.pool = nn.AvgPool2d(kernel_size=kernel_size)
111
+
112
+ def forward(self, x):
113
+ for i in range(self.n_blocks):
114
+ x = self.conv[i](x)
115
+ if self.kernel_size is not None:
116
+ return x, self.pool(x)
117
+ else:
118
+ return x
119
+
120
+
121
+ class Intermediate(nn.Module): #
122
+ def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
123
+ super(Intermediate, self).__init__()
124
+ self.n_inters = n_inters
125
+ self.layers = nn.ModuleList()
126
+ self.layers.append(
127
+ ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum)
128
+ )
129
+ for i in range(self.n_inters - 1):
130
+ self.layers.append(
131
+ ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum)
132
+ )
133
+
134
+ def forward(self, x):
135
+ for i in range(self.n_inters):
136
+ x = self.layers[i](x)
137
+ return x
138
+
139
+
140
+ class ResDecoderBlock(nn.Module):
141
+ def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
142
+ super(ResDecoderBlock, self).__init__()
143
+ out_padding = (0, 1) if stride == (1, 2) else (1, 1)
144
+ self.n_blocks = n_blocks
145
+ self.conv1 = nn.Sequential(
146
+ nn.ConvTranspose2d(
147
+ in_channels=in_channels,
148
+ out_channels=out_channels,
149
+ kernel_size=(3, 3),
150
+ stride=stride,
151
+ padding=(1, 1),
152
+ output_padding=out_padding,
153
+ bias=False,
154
+ ),
155
+ nn.BatchNorm2d(out_channels, momentum=momentum),
156
+ nn.ReLU(),
157
+ )
158
+ self.conv2 = nn.ModuleList()
159
+ self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
160
+ for i in range(n_blocks - 1):
161
+ self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))
162
+
163
+ def forward(self, x, concat_tensor):
164
+ x = self.conv1(x)
165
+ x = torch.cat((x, concat_tensor), dim=1)
166
+ for i in range(self.n_blocks):
167
+ x = self.conv2[i](x)
168
+ return x
169
+
170
+
171
+ class Decoder(nn.Module):
172
+ def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
173
+ super(Decoder, self).__init__()
174
+ self.layers = nn.ModuleList()
175
+ self.n_decoders = n_decoders
176
+ for i in range(self.n_decoders):
177
+ out_channels = in_channels // 2
178
+ self.layers.append(
179
+ ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum)
180
+ )
181
+ in_channels = out_channels
182
+
183
+ def forward(self, x, concat_tensors):
184
+ for i in range(self.n_decoders):
185
+ x = self.layers[i](x, concat_tensors[-1 - i])
186
+ return x
187
+
188
+
189
+ class DeepUnet(nn.Module):
190
+ def __init__(
191
+ self,
192
+ kernel_size,
193
+ n_blocks,
194
+ en_de_layers=5,
195
+ inter_layers=4,
196
+ in_channels=1,
197
+ en_out_channels=16,
198
+ ):
199
+ super(DeepUnet, self).__init__()
200
+ self.encoder = Encoder(
201
+ in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels
202
+ )
203
+ self.intermediate = Intermediate(
204
+ self.encoder.out_channel // 2,
205
+ self.encoder.out_channel,
206
+ inter_layers,
207
+ n_blocks,
208
+ )
209
+ self.decoder = Decoder(
210
+ self.encoder.out_channel, en_de_layers, kernel_size, n_blocks
211
+ )
212
+
213
+ def forward(self, x):
214
+ x, concat_tensors = self.encoder(x)
215
+ x = self.intermediate(x)
216
+ x = self.decoder(x, concat_tensors)
217
+ return x
218
+
219
+
220
+ class E2E(nn.Module):
221
+ def __init__(
222
+ self,
223
+ n_blocks,
224
+ n_gru,
225
+ kernel_size,
226
+ en_de_layers=5,
227
+ inter_layers=4,
228
+ in_channels=1,
229
+ en_out_channels=16,
230
+ ):
231
+ super(E2E, self).__init__()
232
+ self.unet = DeepUnet(
233
+ kernel_size,
234
+ n_blocks,
235
+ en_de_layers,
236
+ inter_layers,
237
+ in_channels,
238
+ en_out_channels,
239
+ )
240
+ self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
241
+ if n_gru:
242
+ self.fc = nn.Sequential(
243
+ BiGRU(3 * 128, 256, n_gru),
244
+ nn.Linear(512, 360),
245
+ nn.Dropout(0.25),
246
+ nn.Sigmoid(),
247
+ )
248
+ else:
249
+ self.fc = nn.Sequential(
250
+ nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid()
251
+ )
252
+
253
+ def forward(self, mel):
254
+ mel = mel.transpose(-1, -2).unsqueeze(1)
255
+ x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
256
+ x = self.fc(x)
257
+ return x
258
+
259
+
260
+ from librosa.filters import mel
261
+
262
+
263
+ class MelSpectrogram(torch.nn.Module):
264
+ def __init__(
265
+ self,
266
+ is_half,
267
+ n_mel_channels,
268
+ sampling_rate,
269
+ win_length,
270
+ hop_length,
271
+ n_fft=None,
272
+ mel_fmin=0,
273
+ mel_fmax=None,
274
+ clamp=1e-5,
275
+ ):
276
+ super().__init__()
277
+ n_fft = win_length if n_fft is None else n_fft
278
+ self.hann_window = {}
279
+ mel_basis = mel(
280
+ sr=sampling_rate,
281
+ n_fft=n_fft,
282
+ n_mels=n_mel_channels,
283
+ fmin=mel_fmin,
284
+ fmax=mel_fmax,
285
+ htk=True,
286
+ )
287
+ mel_basis = torch.from_numpy(mel_basis).float()
288
+ self.register_buffer("mel_basis", mel_basis)
289
+ self.n_fft = win_length if n_fft is None else n_fft
290
+ self.hop_length = hop_length
291
+ self.win_length = win_length
292
+ self.sampling_rate = sampling_rate
293
+ self.n_mel_channels = n_mel_channels
294
+ self.clamp = clamp
295
+ self.is_half = is_half
296
+
297
+ def forward(self, audio, keyshift=0, speed=1, center=True):
298
+ factor = 2 ** (keyshift / 12)
299
+ n_fft_new = int(np.round(self.n_fft * factor))
300
+ win_length_new = int(np.round(self.win_length * factor))
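+ # keyshift is in semitones: factor = 2 ** (keyshift / 12), so keyshift=12 doubles the
+ # FFT and window lengths (one octave up) and keyshift=-12 halves them.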
301
+ hop_length_new = int(np.round(self.hop_length * speed))
302
+ keyshift_key = str(keyshift) + "_" + str(audio.device)
303
+ if keyshift_key not in self.hann_window:
304
+ self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(
305
+ audio.device
306
+ )
307
+ fft = torch.stft(
308
+ audio,
309
+ n_fft=n_fft_new,
310
+ hop_length=hop_length_new,
311
+ win_length=win_length_new,
312
+ window=self.hann_window[keyshift_key],
313
+ center=center,
314
+ return_complex=True,
315
+ )
316
+ magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))
317
+ if keyshift != 0:
318
+ size = self.n_fft // 2 + 1
319
+ resize = magnitude.size(1)
320
+ if resize < size:
321
+ magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
322
+ magnitude = magnitude[:, :size, :] * self.win_length / win_length_new
323
+ mel_output = torch.matmul(self.mel_basis, magnitude)
324
+ if self.is_half == True:
325
+ mel_output = mel_output.half()
326
+ log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
327
+ return log_mel_spec
328
+
329
+
330
+ class RMVPE:
331
+ def __init__(self, model_path, is_half, device=None):
332
+ self.resample_kernel = {}
333
+ model = E2E(4, 1, (2, 2))
334
+ ckpt = torch.load(model_path, map_location="cpu")
335
+ model.load_state_dict(ckpt)
336
+ model.eval()
337
+ if is_half == True:
338
+ model = model.half()
339
+ self.model = model
340
+ self.resample_kernel = {}
341
+ self.is_half = is_half
342
+ if device is None:
343
+ device = "cuda" if torch.cuda.is_available() else "cpu"
344
+ self.device = device
345
+ self.mel_extractor = MelSpectrogram(
346
+ is_half, 128, 16000, 1024, 160, None, 30, 8000
347
+ ).to(device)
348
+ self.model = self.model.to(device)
349
+ cents_mapping = 20 * np.arange(360) + 1997.3794084376191
350
+ self.cents_mapping = np.pad(cents_mapping, (4, 4)) # 368
351
+
352
+ def mel2hidden(self, mel):
353
+ with torch.no_grad():
354
+ n_frames = mel.shape[-1]
355
+ mel = F.pad(
356
+ mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect"
357
+ )
358
+ hidden = self.model(mel)
359
+ return hidden[:, :n_frames]
360
+
361
+ def decode(self, hidden, thred=0.03):
362
+ cents_pred = self.to_local_average_cents(hidden, thred=thred)
363
+ f0 = 10 * (2 ** (cents_pred / 1200))
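+ # cents_mapping spans 360 bins of 20 cents each, so decoded pitches range from roughly
+ # 10 * 2**(1997.38 / 1200) ≈ 31.7 Hz up to about 2 kHz. Frames whose salience never
+ # exceeded the threshold have cents_pred == 0 and therefore decode to exactly 10 Hz,
+ # which the next line zeroes out as "unvoiced".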
364
+ f0[f0 == 10] = 0
365
+ # f0 = np.array([10 * (2 ** (cent_pred / 1200)) if cent_pred else 0 for cent_pred in cents_pred])
366
+ return f0
367
+
368
+ def infer_from_audio(self, audio, thred=0.03):
369
+ audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0)
370
+ # torch.cuda.synchronize()
371
+ # t0=ttime()
372
+ mel = self.mel_extractor(audio, center=True)
373
+ # torch.cuda.synchronize()
374
+ # t1=ttime()
375
+ hidden = self.mel2hidden(mel)
376
+ # torch.cuda.synchronize()
377
+ # t2=ttime()
378
+ hidden = hidden.squeeze(0).cpu().numpy()
379
+ if self.is_half == True:
380
+ hidden = hidden.astype("float32")
381
+ f0 = self.decode(hidden, thred=thred)
382
+ # torch.cuda.synchronize()
383
+ # t3=ttime()
384
+ # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0))
385
+ return f0
386
+
387
+ def to_local_average_cents(self, salience, thred=0.05):
388
+ # t0 = ttime()
389
+ center = np.argmax(salience, axis=1) # index of the peak bin per frame, shape (n_frames,)
+ salience = np.pad(salience, ((0, 0), (4, 4))) # shape (n_frames, 368)
391
+ # t1 = ttime()
392
+ center += 4
393
+ todo_salience = []
394
+ todo_cents_mapping = []
395
+ starts = center - 4
396
+ ends = center + 5
397
+ for idx in range(salience.shape[0]):
398
+ todo_salience.append(salience[:, starts[idx] : ends[idx]][idx])
399
+ todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]])
400
+ # t2 = ttime()
401
+ todo_salience = np.array(todo_salience) # (n_frames, 9)
+ todo_cents_mapping = np.array(todo_cents_mapping) # (n_frames, 9)
+ product_sum = np.sum(todo_salience * todo_cents_mapping, 1)
+ weight_sum = np.sum(todo_salience, 1) # (n_frames,)
+ divided = product_sum / weight_sum # weighted-average pitch in cents, (n_frames,)
+ # t3 = ttime()
+ maxx = np.max(salience, axis=1) # (n_frames,)
+ divided[maxx <= thred] = 0
+ # t4 = ttime()
+ # print("decode:%s\t%s\t%s\t%s" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
+ return divided
412
+
413
+
414
+ # if __name__ == '__main__':
415
+ # audio, sampling_rate = sf.read("卢本伟语录~1.wav")
416
+ # if len(audio.shape) > 1:
417
+ # audio = librosa.to_mono(audio.transpose(1, 0))
418
+ # audio_bak = audio.copy()
419
+ # if sampling_rate != 16000:
420
+ # audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
421
+ # model_path = "/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/test-RMVPE/weights/rmvpe_llc_half.pt"
422
+ # thred = 0.03 # 0.01
423
+ # device = 'cuda' if torch.cuda.is_available() else 'cpu'
424
+ # rmvpe = RMVPE(model_path,is_half=False, device=device)
425
+ # t0=ttime()
426
+ # f0 = rmvpe.infer_from_audio(audio, thred=thred)
427
+ # f0 = rmvpe.infer_from_audio(audio, thred=thred)
428
+ # f0 = rmvpe.infer_from_audio(audio, thred=thred)
429
+ # f0 = rmvpe.infer_from_audio(audio, thred=thred)
430
+ # f0 = rmvpe.infer_from_audio(audio, thred=thred)
431
+ # t1=ttime()
432
+ # print(f0.shape,t1-t0)