Jarod Castillo committed on
Commit
88b57c0
1 Parent(s): e6e9e8c
.gitignore ADDED
@@ -0,0 +1,168 @@
1
+ ### Jarod's Custom Template
2
+ .idea
3
+ .vscode
4
+ datasets
5
+ ignored
6
+
7
+ ### Python template
8
+ # Byte-compiled / optimized / DLL files
9
+ __pycache__/
10
+ *.py[cod]
11
+ *$py.class
12
+
13
+ # C extensions
14
+ *.so
15
+
16
+ # Distribution / packaging
17
+ .Python
18
+ build/
19
+ develop-eggs/
20
+ dist/
21
+ downloads/
22
+ eggs/
23
+ .eggs/
24
+ lib/
25
+ lib64/
26
+ parts/
27
+ sdist/
28
+ var/
29
+ wheels/
30
+ share/python-wheels/
31
+ *.egg-info/
32
+ .installed.cfg
33
+ *.egg
34
+ MANIFEST
35
+
36
+ # PyInstaller
37
+ # Usually these files are written by a python script from a template
38
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
39
+ *.manifest
40
+ *.spec
41
+
42
+ # Installer logs
43
+ pip-log.txt
44
+ pip-delete-this-directory.txt
45
+
46
+ # Unit test / coverage reports
47
+ htmlcov/
48
+ .tox/
49
+ .nox/
50
+ .coverage
51
+ .coverage.*
52
+ .cache
53
+ nosetests.xml
54
+ coverage.xml
55
+ *.cover
56
+ *.py,cover
57
+ .hypothesis/
58
+ .pytest_cache/
59
+ cover/
60
+
61
+ # Translations
62
+ *.mo
63
+ *.pot
64
+
65
+ # Django stuff:
66
+ *.log
67
+ local_settings.py
68
+ db.sqlite3
69
+ db.sqlite3-journal
70
+
71
+ # Flask stuff:
72
+ instance/
73
+ .webassets-cache
74
+
75
+ # Scrapy stuff:
76
+ .scrapy
77
+
78
+ # Sphinx documentation
79
+ docs/_build/
80
+
81
+ # PyBuilder
82
+ .pybuilder/
83
+ target/
84
+
85
+ # Jupyter Notebook
86
+ .ipynb_checkpoints
87
+
88
+ # IPython
89
+ profile_default/
90
+ ipython_config.py
91
+
92
+ # pyenv
93
+ # For a library or package, you might want to ignore these files since the code is
94
+ # intended to run in multiple environments; otherwise, check them in:
95
+ # .python-version
96
+
97
+ # pipenv
98
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
99
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
100
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
101
+ # install all needed dependencies.
102
+ #Pipfile.lock
103
+
104
+ # poetry
105
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
106
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
107
+ # commonly ignored for libraries.
108
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
109
+ #poetry.lock
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ #pdm.lock
114
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
115
+ # in version control.
116
+ # https://pdm.fming.dev/#use-with-ide
117
+ .pdm.toml
118
+
119
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
120
+ __pypackages__/
121
+
122
+ # Celery stuff
123
+ celerybeat-schedule
124
+ celerybeat.pid
125
+
126
+ # SageMath parsed files
127
+ *.sage.py
128
+
129
+ # Environments
130
+ .env
131
+ .venv
132
+ env/
133
+ venv/
134
+ ENV/
135
+ env.bak/
136
+ venv.bak/
137
+
138
+ # Spyder project settings
139
+ .spyderproject
140
+ .spyproject
141
+
142
+ # Rope project settings
143
+ .ropeproject
144
+
145
+ # mkdocs documentation
146
+ /site
147
+
148
+ # mypy
149
+ .mypy_cache/
150
+ .dmypy.json
151
+ dmypy.json
152
+
153
+ # Pyre type checker
154
+ .pyre/
155
+
156
+ # pytype static type analyzer
157
+ .pytype/
158
+
159
+ # Cython debug symbols
160
+ cython_debug/
161
+
162
+ # PyCharm
163
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
164
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
165
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
166
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
167
+ #.idea/
168
+
.streamlit/config.toml ADDED
@@ -0,0 +1,120 @@
1
+ [server]
2
+
3
+ # List of folders that should not be watched for changes. This
4
+ # impacts both "Run on Save" and @st.cache.
5
+
6
+ # Relative paths will be taken as relative to the current working directory.
7
+
8
+ # Example: ['/home/user1/env', 'relative/path/to/folder']
9
+
10
+ # Default: []
11
+ folderWatchBlacklist = []
12
+
13
+ # Change the type of file watcher used by Streamlit, or turn it off
14
+ # completely.
15
+
16
+ # Allowed values:
17
+ # * "auto" : Streamlit will attempt to use the watchdog module, and
18
+ # falls back to polling if watchdog is not available.
19
+ # * "watchdog" : Force Streamlit to use the watchdog module.
20
+ # * "poll" : Force Streamlit to always use polling.
21
+ # * "none" : Streamlit will not watch files.
22
+
23
+ # Default: "auto"
24
+ fileWatcherType = "auto"
25
+
26
+ # Symmetric key used to produce signed cookies. If deploying on multiple
27
+ # replicas, this should be set to the same value across all replicas to ensure
28
+ # they all share the same secret.
29
+
30
+ # Default: randomly generated secret key.
31
+ cookieSecret = "a-random-key-appears-here"
32
+
33
+ # If false, will attempt to open a browser window on start.
34
+
35
+ # Default: false unless (1) we are on a Linux box where DISPLAY is unset, or
36
+ # (2) we are running in the Streamlit Atom plugin.
37
+ # headless = false
38
+
39
+ # Automatically rerun script when the file is modified on disk.
40
+
41
+ # Default: false
42
+ runOnSave = false
43
+
44
+ # The address where the server will listen for client and browser
45
+ # connections. Use this if you want to bind the server to a specific address.
46
+ # If set, the server will only be accessible from this address, and not from
47
+ # any aliases (like localhost).
48
+
49
+ # Default: (unset)
50
+ # address =
51
+
52
+ # The port where the server will listen for browser connections.
53
+
54
+ # Default: 8501
55
+ port = 8501
56
+
57
+ # The base path for the URL where Streamlit should be served from.
58
+
59
+ # Default: ""
60
+ baseUrlPath = ""
61
+
62
+ # Enables support for Cross-Origin Resource Sharing (CORS) protection, for
63
+ # added security.
64
+
65
+ # Due to conflicts between CORS and XSRF, if `server.enableXsrfProtection` is
66
+ # on and `server.enableCORS` is off at the same time, we will prioritize
67
+ # `server.enableXsrfProtection`.
68
+
69
+ # Default: true
70
+ enableCORS = true
71
+
72
+ # Enables support for Cross-Site Request Forgery (XSRF) protection, for added
73
+ # security.
74
+
75
+ # Due to conflicts between CORS and XSRF, if `server.enableXsrfProtection` is
76
+ # on and `server.enableCORS` is off at the same time, we will prioritize
77
+ # `server.enableXsrfProtection`.
78
+
79
+ # Default: true
80
+ enableXsrfProtection = true
81
+
82
+ # Max size, in megabytes, for files uploaded with the file_uploader.
83
+
84
+ # Default: 200
85
+ maxUploadSize = 200
86
+
87
+ # Max size, in megabytes, of messages that can be sent via the WebSocket
88
+ # connection.
89
+
90
+ # Default: 200
91
+ maxMessageSize = 200
92
+
93
+ # Enables support for websocket compression.
94
+
95
+ # Default: false
96
+ enableWebsocketCompression = false
97
+
98
+ # Enable serving files from a `static` directory in the running app's
99
+ # directory.
100
+
101
+ # Default: false
102
+ enableStaticServing = false
103
+
104
+ # Server certificate file for connecting via HTTPS.
105
+ # Must be set at the same time as "server.sslKeyFile".
106
+
107
+ # ['DO NOT USE THIS OPTION IN A PRODUCTION ENVIRONMENT. It has not gone through
108
+ # security audits or performance tests. For the production environment, we
109
+ # recommend performing SSL termination by the load balancer or the reverse
110
+ # proxy.']
111
+ # sslCertFile =
112
+
113
+ # Cryptographic key file for connecting via HTTPS.
114
+ # Must be set at the same time as "server.sslCertFile".
115
+
116
+ # ['DO NOT USE THIS OPTION IN A PRODUCTION ENVIRONMENT. It has not gone through
117
+ # security audits or performance tests. For the production environment, we
118
+ # recommend performing SSL termination by the load balancer or the reverse
119
+ # proxy.']
120
+ # sslKeyFile =
README.md CHANGED
@@ -1,13 +1,70 @@
1
- ---
2
- title: Sing For Me
3
- emoji: 🌍
4
- colorFrom: blue
5
- colorTo: indigo
6
- sdk: streamlit
7
- sdk_version: 1.25.0
8
- app_file: app.py
9
- pinned: false
10
- license: openrail
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ # Vocal Remover
2
+
3
+ A web-based tool for removing vocals from audio files using deep learning.
4
+
5
+ ## Table of Contents
6
+
7
+ - [Overview](#overview)
8
+ - [Features](#features)
9
+ - [Installation](#installation)
10
+ - [Usage](#usage)
11
+ - [Demo](#demo)
12
+ - [Technologies Used](#technologies-used)
13
+ - [Contributing](#contributing)
14
+ - [License](#license)
15
+ - [Useful Research Papers](#useful-research-papers)
16
+
17
+ ## Overview
18
+
19
+ The Vocal Remover is a user-friendly web application that leverages deep learning models to remove vocals from audio files. It provides an easy, interactive way for users to upload their audio files and process them to obtain vocal-free versions.
20
+
21
+ ## Features
22
+
23
+ - Upload audio files in various formats (WAV, MP3, OGG, FLAC).
24
+ - Process audio files to remove vocals using a pre-trained deep learning model.
25
+ - Display a progress bar during audio processing.
26
+ - Play the original and processed audio files in the browser.
27
+ - Download the processed audio as a WAV file.
28
+ - Clean and intuitive user interface.
29
+
30
+ ## Installation
31
+
32
+ 1. Clone this repository:
33
+
34
+ ```bash
35
+ git clone https://github.com/smotto/Sing-For-Me.git
36
+ cd Sing-For-Me
+ ```
37
+
38
+ 2. Install the required Python packages:
39
+
40
+ ```bash
41
+ pip install -r requirements.txt
+ ```
42
+
43
+ ## Usage
44
+ 1. Run the Streamlit app:
45
+
46
+ ```bash
47
+ streamlit run app.py
+ ```
48
+
49
+ 2. Access the app in your web browser at http://localhost:8501.
50
+
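+ The separation can also be driven outside the web UI. The sketch below is a minimal, untested example that wires together the modules committed under `src/`; the `my_song` filename and the `_NullProgress` stand-in for the Streamlit progress bar are placeholders, while the model settings mirror the ones `app.py` uses.
+
+ ```python
+ import os
+
+ import soundfile as sf
+ import torch
+
+ from src.constants import INPUT_FOLDER, ONNX_MODEL_PATH, OUTPUT_FOLDER
+ from src.loader import Loader
+ from src.models.MDX_net.kimvocal import KimVocal
+ from src.models.MDX_net.mdx_net import Conv_TDF_net_trimm
+
+
+ class _NullProgress:
+     """Stand-in for st.progress() when running outside Streamlit."""
+
+     def progress(self, value):
+         pass
+
+
+ # Load a stereo WAV from ./datasets/input/my_song.wav at 44.1 kHz.
+ loader = Loader(INPUT_FOLDER, OUTPUT_FOLDER)
+ music_array, samplerate = loader.load_wav("my_song")
+
+ # Same hyperparameters that app.py passes to the ONNX wrapper.
+ model = Conv_TDF_net_trimm(
+     model_path=ONNX_MODEL_PATH, use_onnx=True, target_name="vocals",
+     L=11, l=3, g=48, bn=8, bias=False, dim_f=11, dim_t=8,
+ )
+
+ vocals = KimVocal().demix_vocals(
+     music_tensor=torch.tensor(music_array, dtype=torch.float32),
+     sample_rate=samplerate,
+     model=model,
+     streamlit_progressbar=_NullProgress(),  # placeholder progress sink
+ )
+
+ os.makedirs(OUTPUT_FOLDER, exist_ok=True)
+ sf.write(f"{OUTPUT_FOLDER}/vocals.wav", vocals.numpy().T, samplerate)
+ ```
+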
51
+ ## Demo
52
+ For a live demonstration, visit Demo Link.
53
+
54
+ ## Technologies Used
55
+ * Python
56
+ * Streamlit
57
+ * PyTorch
58
+ * Soundfile and Librosa
59
+
60
+ ## Contributing
61
+ Contributions are welcome! If you have suggestions, bug reports, or feature requests, please open an issue or submit a pull request.
62
+
63
+ ## License
64
+ This project is licensed under the Apache 2.0 License.
65
+
66
+ ## Useful Research Papers
67
+ - [U-Net: Convolutional Networks for Biomedical Image Segmentation](https://arxiv.org/abs/1505.04597)
68
+ - [WaveNet: A Generative Model for Raw Audio](https://arxiv.org/abs/1609.03499)
69
+ - [Wave-U-Net: A Multi-Scale Neural Network for End-to-End Audio Source Separation](https://arxiv.org/abs/1806.03185)
70
+ - [KUIELab-MDX-Net: A Two-Stream Neural Network for Music Demixing](https://arxiv.org/abs/2111.12203)
app.py ADDED
@@ -0,0 +1,81 @@
1
+ # Standard Library
2
+ import os
3
+
4
+ # Third-Party
5
+ import streamlit as st
6
+
7
+ # Local
8
+ from src.models.MDX_net.kimvocal import KimVocal
9
+ from src.loader import Loader
10
+ from src.models.MDX_net.mdx_net import Conv_TDF_net_trimm
11
+
12
+ # Constants
13
+ from src.constants import ONNX_MODEL_PATH
14
+
15
+ INPUT_FOLDER = "./datasets/input"
16
+ OUTPUT_FOLDER = "./datasets/output"
17
+
18
+
19
+ def main():
20
+ # Set page configuration and theming
21
+ st.set_page_config(
22
+ page_title="Jarod's Vocal Remover",
23
+ page_icon="🎵",
24
+ )
25
+ st.title("Vocal Remover")
26
+
27
+ # Upload WAV file
28
+ uploaded_file = st.file_uploader(
29
+ "Upload an Audio File (WAV, MP3, OGG, FLAC)",
30
+ type=["wav", "mp3", "ogg", "flac"],
31
+ key="file_uploader",
32
+ )
33
+
34
+ if uploaded_file is not None:
35
+ uploaded_file_ext = uploaded_file.name.lower().split(".")[-1]
36
+ # Process the uploaded audio
37
+ st.subheader("Audio Processing")
38
+ st.write("Processing the uploaded audio file...")
39
+
40
+ # Display a progress bar while processing
41
+ progress_bar = st.progress(0)
42
+ progress_text = st.empty()
43
+
44
+ loader = Loader(INPUT_FOLDER, OUTPUT_FOLDER)
45
+ music_tensor, samplerate = loader.prepare_uploaded_file(
46
+ uploaded_file=uploaded_file
47
+ )
48
+
49
+ model_raw_python = Conv_TDF_net_trimm(
50
+ model_path=ONNX_MODEL_PATH,
51
+ use_onnx=True,
52
+ target_name="vocals",
53
+ L=11,
54
+ l=3,
55
+ g=48,
56
+ bn=8,
57
+ bias=False,
58
+ dim_f=11,
59
+ dim_t=8,
60
+ )
61
+
62
+ kimvocal = KimVocal()
63
+ vocals_tensor = kimvocal.demix_vocals(
64
+ music_tensor=music_tensor,
65
+ sample_rate=samplerate,
66
+ model=model_raw_python,
67
+ streamlit_progressbar=progress_bar,
68
+ )
69
+ vocals_array = vocals_tensor.numpy()
70
+
71
+ # Update progress
72
+ progress_bar.progress(100)
73
+ progress_text.text("Audio processing complete!")
74
+
75
+ # Display processed audio
76
+ st.subheader("Processed Audio")
77
+ st.audio(vocals_array, format="audio/wav", sample_rate=samplerate)
78
+
79
+
80
+ if __name__ == "__main__":
81
+ main()
pretrained_models/MDX_net/Kim_Vocal.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce74ef3b6a6024ce44211a07be9cf8bc6d87728cc852a68ab34eb8e58cde9c8b
3
+ size 66759214
pretrained_models/MDX_net/model_data/model_data.json ADDED
@@ -0,0 +1,340 @@
1
+ {
2
+ "0ddfc0eb5792638ad5dc27850236c246": {
3
+ "compensate": 1.035,
4
+ "mdx_dim_f_set": 2048,
5
+ "mdx_dim_t_set": 8,
6
+ "mdx_n_fft_scale_set": 6144,
7
+ "primary_stem": "Vocals"
8
+ },
9
+ "26d308f91f3423a67dc69a6d12a8793d": {
10
+ "compensate": 1.035,
11
+ "mdx_dim_f_set": 2048,
12
+ "mdx_dim_t_set": 9,
13
+ "mdx_n_fft_scale_set": 8192,
14
+ "primary_stem": "Other"
15
+ },
16
+ "2cdd429caac38f0194b133884160f2c6": {
17
+ "compensate": 1.045,
18
+ "mdx_dim_f_set": 3072,
19
+ "mdx_dim_t_set": 8,
20
+ "mdx_n_fft_scale_set": 7680,
21
+ "primary_stem": "Instrumental"
22
+ },
23
+ "2f5501189a2f6db6349916fabe8c90de": {
24
+ "compensate": 1.035,
25
+ "mdx_dim_f_set": 2048,
26
+ "mdx_dim_t_set": 8,
27
+ "mdx_n_fft_scale_set": 6144,
28
+ "primary_stem": "Vocals"
29
+ },
30
+ "398580b6d5d973af3120df54cee6759d": {
31
+ "compensate": 1.75,
32
+ "mdx_dim_f_set": 3072,
33
+ "mdx_dim_t_set": 8,
34
+ "mdx_n_fft_scale_set": 7680,
35
+ "primary_stem": "Vocals"
36
+ },
37
+ "488b3e6f8bd3717d9d7c428476be2d75": {
38
+ "compensate": 1.035,
39
+ "mdx_dim_f_set": 3072,
40
+ "mdx_dim_t_set": 8,
41
+ "mdx_n_fft_scale_set": 7680,
42
+ "primary_stem": "Instrumental"
43
+ },
44
+ "4910e7827f335048bdac11fa967772f9": {
45
+ "compensate": 1.035,
46
+ "mdx_dim_f_set": 2048,
47
+ "mdx_dim_t_set": 7,
48
+ "mdx_n_fft_scale_set": 4096,
49
+ "primary_stem": "Drums"
50
+ },
51
+ "53c4baf4d12c3e6c3831bb8f5b532b93": {
52
+ "compensate": 1.043,
53
+ "mdx_dim_f_set": 3072,
54
+ "mdx_dim_t_set": 8,
55
+ "mdx_n_fft_scale_set": 7680,
56
+ "primary_stem": "Vocals"
57
+ },
58
+ "5d343409ef0df48c7d78cce9f0106781": {
59
+ "compensate": 1.075,
60
+ "mdx_dim_f_set": 3072,
61
+ "mdx_dim_t_set": 8,
62
+ "mdx_n_fft_scale_set": 7680,
63
+ "primary_stem": "Vocals"
64
+ },
65
+ "5f6483271e1efb9bfb59e4a3e6d4d098": {
66
+ "compensate": 1.035,
67
+ "mdx_dim_f_set": 2048,
68
+ "mdx_dim_t_set": 9,
69
+ "mdx_n_fft_scale_set": 6144,
70
+ "primary_stem": "Vocals"
71
+ },
72
+ "65ab5919372a128e4167f5e01a8fda85": {
73
+ "compensate": 1.035,
74
+ "mdx_dim_f_set": 2048,
75
+ "mdx_dim_t_set": 8,
76
+ "mdx_n_fft_scale_set": 8192,
77
+ "primary_stem": "Other"
78
+ },
79
+ "6703e39f36f18aa7855ee1047765621d": {
80
+ "compensate": 1.035,
81
+ "mdx_dim_f_set": 2048,
82
+ "mdx_dim_t_set": 9,
83
+ "mdx_n_fft_scale_set": 16384,
84
+ "primary_stem": "Bass"
85
+ },
86
+ "6b31de20e84392859a3d09d43f089515": {
87
+ "compensate": 1.035,
88
+ "mdx_dim_f_set": 2048,
89
+ "mdx_dim_t_set": 8,
90
+ "mdx_n_fft_scale_set": 6144,
91
+ "primary_stem": "Vocals"
92
+ },
93
+ "867595e9de46f6ab699008295df62798": {
94
+ "compensate": 1.03,
95
+ "mdx_dim_f_set": 3072,
96
+ "mdx_dim_t_set": 8,
97
+ "mdx_n_fft_scale_set": 7680,
98
+ "primary_stem": "Vocals"
99
+ },
100
+ "a3cd63058945e777505c01d2507daf37": {
101
+ "compensate": 1.03,
102
+ "mdx_dim_f_set": 2048,
103
+ "mdx_dim_t_set": 8,
104
+ "mdx_n_fft_scale_set": 6144,
105
+ "primary_stem": "Vocals"
106
+ },
107
+ "b33d9b3950b6cbf5fe90a32608924700": {
108
+ "compensate": 1.03,
109
+ "mdx_dim_f_set": 3072,
110
+ "mdx_dim_t_set": 8,
111
+ "mdx_n_fft_scale_set": 7680,
112
+ "primary_stem": "Vocals"
113
+ },
114
+ "c3b29bdce8c4fa17ec609e16220330ab": {
115
+ "compensate": 1.035,
116
+ "mdx_dim_f_set": 2048,
117
+ "mdx_dim_t_set": 8,
118
+ "mdx_n_fft_scale_set": 16384,
119
+ "primary_stem": "Bass"
120
+ },
121
+ "ceed671467c1f64ebdfac8a2490d0d52": {
122
+ "compensate": 1.035,
123
+ "mdx_dim_f_set": 3072,
124
+ "mdx_dim_t_set": 8,
125
+ "mdx_n_fft_scale_set": 7680,
126
+ "primary_stem": "Instrumental"
127
+ },
128
+ "d2a1376f310e4f7fa37fb9b5774eb701": {
129
+ "compensate": 1.035,
130
+ "mdx_dim_f_set": 3072,
131
+ "mdx_dim_t_set": 8,
132
+ "mdx_n_fft_scale_set": 7680,
133
+ "primary_stem": "Instrumental"
134
+ },
135
+ "d7bff498db9324db933d913388cba6be": {
136
+ "compensate": 1.035,
137
+ "mdx_dim_f_set": 2048,
138
+ "mdx_dim_t_set": 8,
139
+ "mdx_n_fft_scale_set": 6144,
140
+ "primary_stem": "Vocals"
141
+ },
142
+ "d94058f8c7f1fae4164868ae8ae66b20": {
143
+ "compensate": 1.035,
144
+ "mdx_dim_f_set": 2048,
145
+ "mdx_dim_t_set": 8,
146
+ "mdx_n_fft_scale_set": 6144,
147
+ "primary_stem": "Vocals"
148
+ },
149
+ "dc41ede5961d50f277eb846db17f5319": {
150
+ "compensate": 1.035,
151
+ "mdx_dim_f_set": 2048,
152
+ "mdx_dim_t_set": 9,
153
+ "mdx_n_fft_scale_set": 4096,
154
+ "primary_stem": "Drums"
155
+ },
156
+ "e5572e58abf111f80d8241d2e44e7fa4": {
157
+ "compensate": 1.028,
158
+ "mdx_dim_f_set": 3072,
159
+ "mdx_dim_t_set": 8,
160
+ "mdx_n_fft_scale_set": 7680,
161
+ "primary_stem": "Instrumental"
162
+ },
163
+ "e7324c873b1f615c35c1967f912db92a": {
164
+ "compensate": 1.03,
165
+ "mdx_dim_f_set": 3072,
166
+ "mdx_dim_t_set": 8,
167
+ "mdx_n_fft_scale_set": 7680,
168
+ "primary_stem": "Vocals"
169
+ },
170
+ "1c56ec0224f1d559c42fd6fd2a67b154": {
171
+ "compensate": 1.025,
172
+ "mdx_dim_f_set": 2048,
173
+ "mdx_dim_t_set": 8,
174
+ "mdx_n_fft_scale_set": 5120,
175
+ "primary_stem": "Instrumental"
176
+ },
177
+ "f2df6d6863d8f435436d8b561594ff49": {
178
+ "compensate": 1.035,
179
+ "mdx_dim_f_set": 3072,
180
+ "mdx_dim_t_set": 8,
181
+ "mdx_n_fft_scale_set": 7680,
182
+ "primary_stem": "Instrumental"
183
+ },
184
+ "b06327a00d5e5fbc7d96e1781bbdb596": {
185
+ "compensate": 1.035,
186
+ "mdx_dim_f_set": 3072,
187
+ "mdx_dim_t_set": 8,
188
+ "mdx_n_fft_scale_set": 6144,
189
+ "primary_stem": "Instrumental"
190
+ },
191
+ "94ff780b977d3ca07c7a343dab2e25dd": {
192
+ "compensate": 1.039,
193
+ "mdx_dim_f_set": 3072,
194
+ "mdx_dim_t_set": 8,
195
+ "mdx_n_fft_scale_set": 6144,
196
+ "primary_stem": "Instrumental"
197
+ },
198
+ "73492b58195c3b52d34590d5474452f6": {
199
+ "compensate": 1.043,
200
+ "mdx_dim_f_set": 3072,
201
+ "mdx_dim_t_set": 8,
202
+ "mdx_n_fft_scale_set": 7680,
203
+ "primary_stem": "Vocals"
204
+ },
205
+ "970b3f9492014d18fefeedfe4773cb42": {
206
+ "compensate": 1.009,
207
+ "mdx_dim_f_set": 3072,
208
+ "mdx_dim_t_set": 8,
209
+ "mdx_n_fft_scale_set": 7680,
210
+ "primary_stem": "Vocals"
211
+ },
212
+ "1d64a6d2c30f709b8c9b4ce1366d96ee": {
213
+ "compensate": 1.065,
214
+ "mdx_dim_f_set": 2048,
215
+ "mdx_dim_t_set": 8,
216
+ "mdx_n_fft_scale_set": 5120,
217
+ "primary_stem": "Instrumental"
218
+ },
219
+ "203f2a3955221b64df85a41af87cf8f0": {
220
+ "compensate": 1.035,
221
+ "mdx_dim_f_set": 3072,
222
+ "mdx_dim_t_set": 8,
223
+ "mdx_n_fft_scale_set": 6144,
224
+ "primary_stem": "Instrumental"
225
+ },
226
+ "291c2049608edb52648b96e27eb80e95": {
227
+ "compensate": 1.035,
228
+ "mdx_dim_f_set": 3072,
229
+ "mdx_dim_t_set": 8,
230
+ "mdx_n_fft_scale_set": 6144,
231
+ "primary_stem": "Instrumental"
232
+ },
233
+ "ead8d05dab12ec571d67549b3aab03fc": {
234
+ "compensate": 1.035,
235
+ "mdx_dim_f_set": 3072,
236
+ "mdx_dim_t_set": 8,
237
+ "mdx_n_fft_scale_set": 6144,
238
+ "primary_stem": "Instrumental"
239
+ },
240
+ "cc63408db3d80b4d85b0287d1d7c9632": {
241
+ "compensate": 1.033,
242
+ "mdx_dim_f_set": 3072,
243
+ "mdx_dim_t_set": 8,
244
+ "mdx_n_fft_scale_set": 6144,
245
+ "primary_stem": "Instrumental"
246
+ },
247
+ "cd5b2989ad863f116c855db1dfe24e39": {
248
+ "compensate": 1.035,
249
+ "mdx_dim_f_set": 3072,
250
+ "mdx_dim_t_set": 9,
251
+ "mdx_n_fft_scale_set": 6144,
252
+ "primary_stem": "Other"
253
+ },
254
+ "55657dd70583b0fedfba5f67df11d711": {
255
+ "compensate": 1.022,
256
+ "mdx_dim_f_set": 3072,
257
+ "mdx_dim_t_set": 8,
258
+ "mdx_n_fft_scale_set": 6144,
259
+ "primary_stem": "Instrumental"
260
+ },
261
+ "b6bccda408a436db8500083ef3491e8b": {
262
+ "compensate": 1.02,
263
+ "mdx_dim_f_set": 3072,
264
+ "mdx_dim_t_set": 8,
265
+ "mdx_n_fft_scale_set": 7680,
266
+ "primary_stem": "Instrumental"
267
+ },
268
+ "8a88db95c7fb5dbe6a095ff2ffb428b1": {
269
+ "compensate": 1.026,
270
+ "mdx_dim_f_set": 2048,
271
+ "mdx_dim_t_set": 8,
272
+ "mdx_n_fft_scale_set": 5120,
273
+ "primary_stem": "Instrumental"
274
+ },
275
+ "b78da4afc6512f98e4756f5977f5c6b9": {
276
+ "compensate": 1.021,
277
+ "mdx_dim_f_set": 3072,
278
+ "mdx_dim_t_set": 8,
279
+ "mdx_n_fft_scale_set": 7680,
280
+ "primary_stem": "Instrumental"
281
+ },
282
+ "77d07b2667ddf05b9e3175941b4454a0": {
283
+ "compensate": 1.021,
284
+ "mdx_dim_f_set": 3072,
285
+ "mdx_dim_t_set": 8,
286
+ "mdx_n_fft_scale_set": 7680,
287
+ "primary_stem": "Vocals"
288
+ },
289
+ "2154254ee89b2945b97a7efed6e88820": {
290
+ "config_yaml": "model_2_stem_061321.yaml"
291
+ },
292
+ "063aadd735d58150722926dcbf5852a9": {
293
+ "config_yaml": "model_2_stem_061321.yaml"
294
+ },
295
+ "fe96801369f6a148df2720f5ced88c19": {
296
+ "config_yaml": "model3.yaml"
297
+ },
298
+ "02e8b226f85fb566e5db894b9931c640": {
299
+ "config_yaml": "model2.yaml"
300
+ },
301
+ "e3de6d861635ab9c1d766149edd680d6": {
302
+ "config_yaml": "model1.yaml"
303
+ },
304
+ "3f2936c554ab73ce2e396d54636bd373": {
305
+ "config_yaml": "modelB.yaml"
306
+ },
307
+ "890d0f6f82d7574bca741a9e8bcb8168": {
308
+ "config_yaml": "modelB.yaml"
309
+ },
310
+ "63a3cb8c37c474681049be4ad1ba8815": {
311
+ "config_yaml": "modelB.yaml"
312
+ },
313
+ "a7fc5d719743c7fd6b61bd2b4d48b9f0": {
314
+ "config_yaml": "modelA.yaml"
315
+ },
316
+ "3567f3dee6e77bf366fcb1c7b8bc3745": {
317
+ "config_yaml": "modelA.yaml"
318
+ },
319
+ "a28f4d717bd0d34cd2ff7a3b0a3d065e": {
320
+ "config_yaml": "modelA.yaml"
321
+ },
322
+ "c9971a18da20911822593dc81caa8be9": {
323
+ "config_yaml": "sndfx.yaml"
324
+ },
325
+ "57d94d5ed705460d21c75a5ac829a605": {
326
+ "config_yaml": "sndfx.yaml"
327
+ },
328
+ "e7a25f8764f25a52c1b96c4946e66ba2": {
329
+ "config_yaml": "sndfx.yaml"
330
+ },
331
+ "104081d24e37217086ce5fde09147ee1": {
332
+ "config_yaml": "model_2_stem_061321.yaml"
333
+ },
334
+ "1e6165b601539f38d0a9330f3facffeb": {
335
+ "config_yaml": "model_2_stem_061321.yaml"
336
+ },
337
+ "fe0108464ce0d8271be5ab810891bd7c": {
338
+ "config_yaml": "model_2_stem_full_band.yaml"
339
+ }
340
+ }
pretrained_models/MDX_net/model_data/model_name_mapper.json ADDED
@@ -0,0 +1,22 @@
1
+ {
2
+ "UVR_MDXNET_1_9703": "UVR-MDX-NET 1",
3
+ "UVR_MDXNET_2_9682": "UVR-MDX-NET 2",
4
+ "UVR_MDXNET_3_9662": "UVR-MDX-NET 3",
5
+ "UVR_MDXNET_KARA": "UVR-MDX-NET Karaoke",
6
+ "UVR_MDXNET_Main": "UVR-MDX-NET Main",
7
+ "UVR-MDX-NET-Inst_1": "UVR-MDX-NET Inst 1",
8
+ "UVR-MDX-NET-Inst_2": "UVR-MDX-NET Inst 2",
9
+ "UVR-MDX-NET-Inst_3": "UVR-MDX-NET Inst 3",
10
+ "UVR-MDX-NET-Inst_4": "UVR-MDX-NET Inst 4",
11
+ "UVR-MDX-NET-Inst_Main": "UVR-MDX-NET Inst Main",
12
+ "UVR-MDX-NET-Inst_Main_2": "UVR-MDX-NET Inst Main 2",
13
+ "UVR-MDX-NET-Inst_HQ_1": "UVR-MDX-NET Inst HQ 1",
14
+ "UVR-MDX-NET-Inst_HQ_2": "UVR-MDX-NET Inst HQ 2",
15
+ "UVR-MDX-NET-Inst_HQ_3": "UVR-MDX-NET Inst HQ 3",
16
+ "UVR_MDXNET_KARA_2": "UVR-MDX-NET Karaoke 2",
17
+ "UVR-MDX-NET-Voc_FT": "UVR-MDX-NET Voc FT",
18
+ "Kim_Vocal_1": "Kim Vocal 1",
19
+ "Kim_Vocal_2": "Kim Vocal 2",
20
+ "Kim_Inst": "Kim Inst",
21
+ "Reverb_HQ_By_FoxJoy": "Reverb HQ"
22
+ }
requirements.txt ADDED
@@ -0,0 +1,8 @@
1
+ pip~=23.2.1
2
+ torch~=2.0.1
3
+ onnxruntime~=1.15.1
4
+ librosa~=0.10.0.post2
5
+ soundfile~=0.12.1
6
+ numpy~=1.24.4
7
+ scipy~=1.11.1
8
+ streamlit~=1.25.0
src/Sound_Feature_Extraction/short_time_fourier_transform.py ADDED
@@ -0,0 +1,50 @@
1
+ import torch
2
+
3
+
4
+ class STFT:
5
+ def __init__(self, n_fft, hop_length, dim_f):
6
+ self.n_fft = n_fft
7
+ self.hop_length = hop_length
8
+ self.window = torch.hann_window(window_length=n_fft, periodic=True)
9
+ self.dim_f = dim_f
10
+
11
+ def __call__(self, x):
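+ # Input: (..., c, t) waveform; output: (..., c * 2, dim_f, n_frames), with the real and
+ # imaginary parts of each channel's spectrogram stacked along the channel axis.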
12
+ window = self.window.to(x.device)
13
+ batch_dims = x.shape[:-2]
14
+ c, t = x.shape[-2:]
15
+ x = x.reshape([-1, t])
16
+ x = torch.stft(
17
+ x,
18
+ n_fft=self.n_fft,
19
+ hop_length=self.hop_length,
20
+ window=window,
21
+ center=True,
22
+ return_complex=True,
23
+ )
24
+ x = torch.view_as_real(x)
25
+ x = x.permute([0, 3, 1, 2])
26
+ x = x.reshape([*batch_dims, c, 2, -1, x.shape[-1]]).reshape(
27
+ [*batch_dims, c * 2, -1, x.shape[-1]]
28
+ )
29
+ return x[..., : self.dim_f, :]
30
+
31
+ def inverse(self, x):
32
+ window = self.window.to(x.device)
33
+ batch_dims = x.shape[:-3]
34
+ c, f, t = x.shape[-3:]
35
+ n = self.n_fft // 2 + 1
36
+ f_pad = torch.zeros([*batch_dims, c, n - f, t]).to(x.device)
37
+ x = torch.cat([x, f_pad], -2)
38
+ x = x.reshape([*batch_dims, c // 2, 2, n, t]).reshape([-1, 2, n, t])
39
+ x = x.permute([0, 2, 3, 1])
40
+ x = x.contiguous()
41
+ t_complex = torch.view_as_complex(x)
42
+ x = torch.istft(
43
+ t_complex,
44
+ n_fft=self.n_fft,
45
+ hop_length=self.hop_length,
46
+ window=window,
47
+ center=True,
48
+ )
49
+ x = x.reshape([*batch_dims, 2, -1])
50
+ return x
src/constants.py ADDED
@@ -0,0 +1,9 @@
1
+ # Third-party
2
+ import torch
3
+
4
+ # Global Variables
5
+ COMPUTATION_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
6
+ EXECUTION_PROVIDER_LIST = ["CUDAExecutionProvider", "CPUExecutionProvider"]
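+ # onnxruntime tries these providers in order, so machines without CUDA fall back to the CPU provider.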
7
+ ONNX_MODEL_PATH = "./pretrained_models/MDX_net/Kim_Vocal.onnx"  # model committed in this repo
8
+ INPUT_FOLDER = "./datasets/input"
9
+ OUTPUT_FOLDER = "./datasets/output"
src/infer.py ADDED
@@ -0,0 +1,20 @@
1
+ # Standard Library Imports
2
+ import os
3
+ import subprocess
4
+
5
+ # Third Party Imports
6
+ import torch
7
+ import onnxruntime as ort
8
+
9
+ # Local Imports
10
+ from models.MDX_net.mdx_net import Conv_TDF_net_trimm
11
+ from loader import Loader
12
+
13
+ vocal_path = r"./datasets/output/vocals.wav"
14
+
15
+ # Global Variables
16
+ COMPUTATION_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
17
+
18
+
19
+ def main():
20
+ print(COMPUTATION_DEVICE)
+
+
+ if __name__ == "__main__":
+ main()
src/loader.py ADDED
@@ -0,0 +1,30 @@
1
+ # Standard Library
2
+ import os
3
+
4
+ # Explicit Typing
5
+ from typing import Tuple
6
+ from numpy import ndarray
7
+
8
+ # Third-party
9
+ import librosa
10
+ import torch
11
+
12
+
13
+ class Loader:
14
+ """Loading sound files into a usable format for pytorch"""
15
+
16
+ def __init__(self, INPUT_FOLDER, OUTPUT_FOLDER):
17
+ self.input = INPUT_FOLDER
18
+ self.output = OUTPUT_FOLDER
+
19
+ def load_wav(self, name) -> Tuple[ndarray, int]:
20
+ music_array, samplerate = librosa.load(
21
+ os.path.join(self.input, name + ".wav"), mono=False, sr=44100
22
+ )
23
+ return music_array, samplerate
24
+
25
+ def prepare_uploaded_file(self, uploaded_file) -> Tuple[torch.Tensor, int]:
26
+ music_array, samplerate = librosa.load(uploaded_file, mono=False, sr=44100)
27
+
28
+ music_tensor = torch.tensor(music_array, dtype=torch.float32)
29
+
30
+ return music_tensor, samplerate
src/models/MDX_net/kimvocal.py ADDED
@@ -0,0 +1,73 @@
1
+ # Standard Library Imports
2
+
3
+ # Third Party Imports
4
+ import torch
5
+ import onnxruntime as ort
6
+
7
+ # Local Imports
8
+ from src.models.MDX_net.mdx_net import Conv_TDF_net_trimm
9
+ from src.loader import Loader
10
+
11
+ # Global Variables
12
+ from src.constants import EXECUTION_PROVIDER_LIST, COMPUTATION_DEVICE, ONNX_MODEL_PATH
13
+
14
+
15
+ class KimVocal:
16
+ """
17
+ TODO: Put something here for flexibility purposes (model types).
18
+ """
19
+
20
+ def __init__(self):
21
+ pass
22
+
23
+ def demix_vocals(self, music_tensor, sample_rate, model, streamlit_progressbar):
24
+ """
25
+ Remove vocals from a mixture using an ONNX model.
+
+ Args:
+ music_tensor (torch.Tensor): Stereo input tensor of shape (2, n_samples).
+ sample_rate (int): Sample rate of the input audio (currently unused).
+ model (torch.nn.Module): Network wrapper providing the STFT, chunk_size, and overlap.
+ streamlit_progressbar (st.progress): Progress bar updated while chunks are processed.
30
+
31
+ Returns:
32
+ torch.Tensor: Output tensor after passing through the network.
33
+ """
34
+ number_of_samples = music_tensor.shape[1]
35
+ overlap = model.overlap
36
+ # Calculate chunk_size and gen_size based on the sample rate
37
+ chunk_size = model.chunk_size
38
+ gen_size = chunk_size - 2 * overlap
39
+ pad_size = gen_size - number_of_samples % gen_size
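+ # With the defaults committed in mdx_net.py (hop=1024, dim_t hard-coded to 256,
+ # overlap=1500), chunk_size = 1024 * 255 = 261120 samples and
+ # gen_size = 261120 - 2 * 1500 = 258120, i.e. each ONNX call sees roughly 5.9 s of
+ # 44.1 kHz audio and the outer ~34 ms on each side is discarded as overlap.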
40
+ mix_padded = torch.cat(
41
+ [torch.zeros(2, overlap), music_tensor, torch.zeros(2, pad_size + overlap)],
42
+ 1,
43
+ )
44
+
45
+ # Start running the session for the model
46
+ ort_session = ort.InferenceSession(ONNX_MODEL_PATH, providers=EXECUTION_PROVIDER_LIST)
47
+
48
+ # TODO: any way to optimize against silence? I think that's what skips are for, gotta double check.
49
+ # process one chunk at a time (batch_size=1)
50
+ demixed_chunks = []
51
+ i = 0
52
+ while i < number_of_samples + pad_size:
53
+ # Progress Bar
54
+ streamlit_progressbar.progress(i / (number_of_samples + pad_size))
55
+
56
+ # Computation
57
+ chunk = mix_padded[:, i : i + chunk_size]
58
+ x = model.stft(chunk.unsqueeze(0).to(COMPUTATION_DEVICE))
59
+ with torch.no_grad():
60
+ x = torch.tensor(ort_session.run(None, {"input": x.cpu().numpy()})[0])
61
+ x = model.stft.inverse(x).squeeze(0)
62
+ x = x[..., overlap:-overlap]
63
+ demixed_chunks.append(x)
64
+ i += gen_size
65
+
66
+ vocals_output = torch.cat(demixed_chunks, -1)[..., :-pad_size].cpu()
67
+
68
+ return vocals_output
69
+
70
+
71
+ if __name__ == "__main__":
+ # KimVocal has no standalone entry point; demix_vocals() is driven by app.py,
+ # which supplies the audio tensor, the model, and a Streamlit progress bar.
+ pass
src/models/MDX_net/mdx_net.py ADDED
@@ -0,0 +1,275 @@
1
+ # Third-party
2
+ import torch
3
+ import torch.nn as nn
4
+
5
+ # Local
6
+ from src.Sound_Feature_Extraction.short_time_fourier_transform import STFT
7
+
8
+ COMPUTATION_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
9
+
10
+
11
+ class Conv_TDF(nn.Module):
12
+ """
13
+ Convolutional Time-Domain Filter (TDF) Module.
14
+
15
+ Args:
16
+ c (int): The number of input and output channels for the convolutional layers.
17
+ l (int): The number of convolutional layers within the module.
18
+ f (int): The number of features (or units) in the time-domain filter.
19
+ k (int): The size of the convolutional kernels (filters).
20
+ bn (int or None): Batch normalization factor (controls TDF behavior). If None, TDF is not used.
21
+ bias (bool): A boolean flag indicating whether bias terms are included in the linear layers.
22
+
23
+ Attributes:
24
+ use_tdf (bool): Flag indicating whether TDF is used.
25
+
26
+ Methods:
27
+ forward(x): Forward pass through the TDF module.
28
+ """
29
+
30
+ def __init__(self, c, l, f, k, bn, bias=True):
31
+ super(Conv_TDF, self).__init__()
32
+
33
+ # Determine whether to use TDF (Time-Domain Filter)
34
+ self.use_tdf = bn is not None
35
+
36
+ # Define a list of convolutional layers within the module
37
+ self.H = nn.ModuleList()
38
+ for i in range(l):
39
+ self.H.append(
40
+ nn.Sequential(
41
+ nn.Conv2d(
42
+ in_channels=c,
43
+ out_channels=c,
44
+ kernel_size=k,
45
+ stride=1,
46
+ padding=k // 2,
47
+ ),
48
+ nn.GroupNorm(2, c),
49
+ nn.ReLU(),
50
+ )
51
+ )
52
+
53
+ # Define the Time-Domain Filter (TDF) layers if enabled
54
+ if self.use_tdf:
55
+ if bn == 0:
56
+ self.tdf = nn.Sequential(
57
+ nn.Linear(f, f, bias=bias), nn.GroupNorm(2, c), nn.ReLU()
58
+ )
59
+ else:
60
+ self.tdf = nn.Sequential(
61
+ nn.Linear(f, f // bn, bias=bias),
62
+ nn.GroupNorm(2, c),
63
+ nn.ReLU(),
64
+ nn.Linear(f // bn, f, bias=bias),
65
+ nn.GroupNorm(2, c),
66
+ nn.ReLU(),
67
+ )
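+ # Example: with the settings app.py passes (bn=8) and the dim_f of 3072 hard-coded in
+ # Conv_TDF_net_trimm, the first TDF bottleneck maps 3072 -> 384 -> 3072 features per channel.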
68
+
69
+ def forward(self, x):
70
+ # Apply the convolutional layers sequentially
71
+ for h in self.H:
72
+ x = h(x)
73
+
74
+ # Apply the Time-Domain Filter (TDF) if enabled, and add the result to the original input
75
+ return x + self.tdf(x) if self.use_tdf else x
76
+
77
+
78
+ class Conv_TDF_net_trimm(nn.Module):
79
+ """
80
+ Convolutional Time-Domain Filter (TDF) Network with Trimming.
81
+
82
+ Args:
83
+ L (int): This parameter controls the number of down-sampling (DS) blocks in the network.
84
+ It's divided by 2 to determine how many DS blocks should be created.
85
+ l (int): This parameter represents the number of convolutional layers (or filters) within each dense (fully connected) block.
86
+ g (int): This parameter specifies the number of output channels for the first convolutional layer and is also used to determine the number of channels for subsequent layers in the network.
87
+ dim_f (int): This parameter represents the number of frequency bins (spectrogram columns) in the input audio data.
88
+ dim_t (int): This parameter represents the number of time frames (spectrogram rows) in the input audio data.
89
+ k (int): This parameter specifies the size of convolutional kernels (filters) used in the network's convolutional layers.
90
+ bn (int or None): This parameter controls whether batch normalization is used in the network.
91
+ If it's None, batch normalization may or may not be used based on other conditions in the code.
92
+ bias (bool): This parameter is a boolean flag that controls whether bias terms are included in the convolutional layers.
93
+ overlap (int): This parameter specifies the amount of overlap between consecutive chunks of audio data during processing.
94
+
95
+ Attributes:
96
+ n (int): The calculated number of down-sampling (DS) blocks.
97
+ dim_f (int): The number of frequency bins (spectrogram columns) in the input audio data.
98
+ dim_t (int): The number of time frames (spectrogram rows) in the input audio data.
99
+ n_fft (int): The size of the Fast Fourier Transform (FFT) window.
100
+ hop (int): The hop size used in the STFT calculations.
101
+ n_bins (int): The number of bins in the frequency domain.
102
+ chunk_size (int): The size of each chunk of audio data.
103
+ target_name (str): The name of the target instrument being separated.
104
+ overlap (int): The amount of overlap between consecutive chunks of audio data during processing.
105
+
106
+ Methods:
107
+ forward(x): Forward pass through the Conv_TDF_net_trimm network.
108
+ """
109
+
110
+ def __init__(
111
+ self,
112
+ model_path,
113
+ use_onnx,
114
+ target_name,
115
+ L,
116
+ l,
117
+ g,
118
+ dim_f,
119
+ dim_t,
120
+ k=3,
121
+ hop=1024,
122
+ bn=None,
123
+ bias=True,
124
+ overlap=1500,
+ mid_tdf=False,
125
+ ):
126
+ super(Conv_TDF_net_trimm, self).__init__()
127
+ # Dictionary specifying the scale for the number of FFT bins for different target names
128
+ n_fft_scale = {"vocals": 3, "*": 2}
129
+
130
+ # Number of input and output channels for the initial and final convolutional layers
131
+ out_c = in_c = 4
132
+
133
+ # Number of down-sampling (DS) blocks
134
+ self.n = L // 2
135
+
136
+ # Dimensions of the frequency and time axes of the input data
137
+ self.dim_f = 3072
138
+ self.dim_t = 256
139
+
140
+ # Number of FFT bins (frequencies) and hop size for the Short-Time Fourier Transform (STFT)
141
+ self.n_fft = 7680
142
+ self.hop = hop
143
+ self.n_bins = self.n_fft // 2 + 1
144
+
145
+ # Chunk size used for processing
146
+ self.chunk_size = hop * (self.dim_t - 1)
147
+
148
+ # Target name for the model
149
+ self.target_name = target_name
150
+
151
+ # Overlap between consecutive chunks of audio data during processing
152
+ self.overlap = overlap
153
+
154
+ # STFT module for audio processing
155
+ self.stft = STFT(self.n_fft, self.hop, self.dim_f)
156
+
157
+ # Check if ONNX representation of the model should be used
158
+ if not use_onnx:
159
+ # First convolutional layer
160
+ self.first_conv = nn.Sequential(
161
+ nn.Conv2d(in_channels=in_c, out_channels=g, kernel_size=1, stride=1),
162
+ nn.BatchNorm2d(g),
163
+ nn.ReLU(),
164
+ )
165
+
166
+ # Initialize variables for dense (fully connected) blocks and downsampling (DS) blocks
167
+ f = self.dim_f
168
+ c = g
169
+ self.ds_dense = nn.ModuleList()
170
+ self.ds = nn.ModuleList()
171
+
172
+ # Loop through down-sampling (DS) blocks
173
+ for i in range(self.n):
174
+ # Create dense (fully connected) block for down-sampling
175
+ self.ds_dense.append(Conv_TDF(c, l, f, k, bn, bias=bias))
176
+
177
+ # Create down-sampling (DS) block
178
+ scale = (2, 2)
179
+ self.ds.append(
180
+ nn.Sequential(
181
+ nn.Conv2d(
182
+ in_channels=c,
183
+ out_channels=c + g,
184
+ kernel_size=scale,
185
+ stride=scale,
186
+ ),
187
+ nn.BatchNorm2d(c + g),
188
+ nn.ReLU(),
189
+ )
190
+ )
191
+ f = f // 2
192
+ c += g
193
+
194
+ # Middle dense (fully connected block)
195
+ self.mid_dense = Conv_TDF(c, l, f, k, bn, bias=bias)
196
+
197
+ # If batch normalization is not specified and the mid_tdf flag (default False) is set, use Conv_TDF with bn=0 and bias=False
198
+ if bn is None and mid_tdf:
199
+ self.mid_dense = Conv_TDF(c, l, f, k, bn=0, bias=False)
200
+
201
+ # Initialize variables for up-sampling (US) blocks
202
+ self.us_dense = nn.ModuleList()
203
+ self.us = nn.ModuleList()
204
+
205
+ # Loop through up-sampling (US) blocks
206
+ for i in range(self.n):
207
+ scale = (2, 2)
208
+ # Create up-sampling (US) block
209
+ self.us.append(
210
+ nn.Sequential(
211
+ nn.ConvTranspose2d(
212
+ in_channels=c,
213
+ out_channels=c - g,
214
+ kernel_size=scale,
215
+ stride=scale,
216
+ ),
217
+ nn.BatchNorm2d(c - g),
218
+ nn.ReLU(),
219
+ )
220
+ )
221
+ f = f * 2
222
+ c -= g
223
+
224
+ # Create dense (fully connected) block for up-sampling
225
+ self.us_dense.append(Conv_TDF(c, l, f, k, bn, bias=bias))
226
+
227
+ # Final convolutional layer
228
+ self.final_conv = nn.Sequential(
229
+ nn.Conv2d(in_channels=c, out_channels=out_c, kernel_size=1, stride=1),
230
+ )
231
+
232
+ try:
233
+ # Load model state from a file
234
+ self.load_state_dict(
235
+ torch.load(
236
+ f"{model_path}/{target_name}.pt",
237
+ map_location=COMPUTATION_DEVICE,
238
+ )
239
+ )
240
+ print(f"Loading model ({target_name})")
241
+ except FileNotFoundError:
242
+ print(f"Random init ({target_name})")
243
+
244
+ def forward(self, x):
245
+ """
246
+ Forward pass through the Conv_TDF_net_trimm network.
247
+
248
+ Args:
249
+ x (torch.Tensor): Input tensor.
250
+
251
+ Returns:
252
+ torch.Tensor: Output tensor after passing through the network.
253
+ """
254
+ x = self.first_conv(x)
255
+
256
+ x = x.transpose(-1, -2)
257
+
258
+ ds_outputs = []
259
+ for i in range(self.n):
260
+ x = self.ds_dense[i](x)
261
+ ds_outputs.append(x)
262
+ x = self.ds[i](x)
263
+
264
+ x = self.mid_dense(x)
265
+
266
+ for i in range(self.n):
267
+ x = self.us[i](x)
268
+ x *= ds_outputs[-i - 1]
269
+ x = self.us_dense[i](x)
270
+
271
+ x = x.transpose(-1, -2)
272
+
273
+ x = self.final_conv(x)
274
+
275
+ return x
src/models/Pitch_Feature_Extraction/rmvpe.py ADDED
1
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ # Defaults matching the configuration used below (128 mel channels, 360 pitch classes);
+ # only the non-GRU branch of E2E refers to them.
+ N_MELS = 128
+ N_CLASS = 360
5
+
6
+
7
+ class BiGRU(nn.Module):
8
+ def __init__(self, input_features, hidden_features, num_layers):
9
+ super(BiGRU, self).__init__()
10
+ self.gru = nn.GRU(
11
+ input_features,
12
+ hidden_features,
13
+ num_layers=num_layers,
14
+ batch_first=True,
15
+ bidirectional=True,
16
+ )
17
+
18
+ def forward(self, x):
19
+ return self.gru(x)[0]
20
+
21
+
22
+ class ConvBlockRes(nn.Module):
23
+ def __init__(self, in_channels, out_channels, momentum=0.01):
24
+ super(ConvBlockRes, self).__init__()
25
+ self.conv = nn.Sequential(
26
+ nn.Conv2d(
27
+ in_channels=in_channels,
28
+ out_channels=out_channels,
29
+ kernel_size=(3, 3),
30
+ stride=(1, 1),
31
+ padding=(1, 1),
32
+ bias=False,
33
+ ),
34
+ nn.BatchNorm2d(out_channels, momentum=momentum),
35
+ nn.ReLU(),
36
+ nn.Conv2d(
37
+ in_channels=out_channels,
38
+ out_channels=out_channels,
39
+ kernel_size=(3, 3),
40
+ stride=(1, 1),
41
+ padding=(1, 1),
42
+ bias=False,
43
+ ),
44
+ nn.BatchNorm2d(out_channels, momentum=momentum),
45
+ nn.ReLU(),
46
+ )
47
+ if in_channels != out_channels:
48
+ self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))
49
+ self.is_shortcut = True
50
+ else:
51
+ self.is_shortcut = False
52
+
53
+ def forward(self, x):
54
+ if self.is_shortcut:
55
+ return self.conv(x) + self.shortcut(x)
56
+ else:
57
+ return self.conv(x) + x
58
+
59
+
60
+ class Encoder(nn.Module):
61
+ def __init__(
62
+ self,
63
+ in_channels,
64
+ in_size,
65
+ n_encoders,
66
+ kernel_size,
67
+ n_blocks,
68
+ out_channels=16,
69
+ momentum=0.01,
70
+ ):
71
+ super(Encoder, self).__init__()
72
+ self.n_encoders = n_encoders
73
+ self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
74
+ self.layers = nn.ModuleList()
75
+ self.latent_channels = []
76
+ for i in range(self.n_encoders):
77
+ self.layers.append(
78
+ ResEncoderBlock(
79
+ in_channels, out_channels, kernel_size, n_blocks, momentum=momentum
80
+ )
81
+ )
82
+ self.latent_channels.append([out_channels, in_size])
83
+ in_channels = out_channels
84
+ out_channels *= 2
85
+ in_size //= 2
86
+ self.out_size = in_size
87
+ self.out_channel = out_channels
88
+
89
+ def forward(self, x):
90
+ concat_tensors = []
91
+ x = self.bn(x)
92
+ for i in range(self.n_encoders):
93
+ _, x = self.layers[i](x)
94
+ concat_tensors.append(_)
95
+ return x, concat_tensors
96
+
97
+
98
+ class ResEncoderBlock(nn.Module):
99
+ def __init__(
100
+ self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01
101
+ ):
102
+ super(ResEncoderBlock, self).__init__()
103
+ self.n_blocks = n_blocks
104
+ self.conv = nn.ModuleList()
105
+ self.conv.append(ConvBlockRes(in_channels, out_channels, momentum))
106
+ for i in range(n_blocks - 1):
107
+ self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
108
+ self.kernel_size = kernel_size
109
+ if self.kernel_size is not None:
110
+ self.pool = nn.AvgPool2d(kernel_size=kernel_size)
111
+
112
+ def forward(self, x):
113
+ for i in range(self.n_blocks):
114
+ x = self.conv[i](x)
115
+ if self.kernel_size is not None:
116
+ return x, self.pool(x)
117
+ else:
118
+ return x
119
+
120
+
121
+ class Intermediate(nn.Module): #
122
+ def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
123
+ super(Intermediate, self).__init__()
124
+ self.n_inters = n_inters
125
+ self.layers = nn.ModuleList()
126
+ self.layers.append(
127
+ ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum)
128
+ )
129
+ for i in range(self.n_inters - 1):
130
+ self.layers.append(
131
+ ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum)
132
+ )
133
+
134
+ def forward(self, x):
135
+ for i in range(self.n_inters):
136
+ x = self.layers[i](x)
137
+ return x
138
+
139
+
140
+ class ResDecoderBlock(nn.Module):
141
+ def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
142
+ super(ResDecoderBlock, self).__init__()
143
+ out_padding = (0, 1) if stride == (1, 2) else (1, 1)
144
+ self.n_blocks = n_blocks
145
+ self.conv1 = nn.Sequential(
146
+ nn.ConvTranspose2d(
147
+ in_channels=in_channels,
148
+ out_channels=out_channels,
149
+ kernel_size=(3, 3),
150
+ stride=stride,
151
+ padding=(1, 1),
152
+ output_padding=out_padding,
153
+ bias=False,
154
+ ),
155
+ nn.BatchNorm2d(out_channels, momentum=momentum),
156
+ nn.ReLU(),
157
+ )
158
+ self.conv2 = nn.ModuleList()
159
+ self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
160
+ for i in range(n_blocks - 1):
161
+ self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))
162
+
163
+ def forward(self, x, concat_tensor):
164
+ x = self.conv1(x)
165
+ x = torch.cat((x, concat_tensor), dim=1)
166
+ for i in range(self.n_blocks):
167
+ x = self.conv2[i](x)
168
+ return x
169
+
170
+
171
+ class Decoder(nn.Module):
172
+ def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
173
+ super(Decoder, self).__init__()
174
+ self.layers = nn.ModuleList()
175
+ self.n_decoders = n_decoders
176
+ for i in range(self.n_decoders):
177
+ out_channels = in_channels // 2
178
+ self.layers.append(
179
+ ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum)
180
+ )
181
+ in_channels = out_channels
182
+
183
+ def forward(self, x, concat_tensors):
184
+ for i in range(self.n_decoders):
185
+ x = self.layers[i](x, concat_tensors[-1 - i])
186
+ return x
187
+
188
+
189
+ class DeepUnet(nn.Module):
190
+ def __init__(
191
+ self,
192
+ kernel_size,
193
+ n_blocks,
194
+ en_de_layers=5,
195
+ inter_layers=4,
196
+ in_channels=1,
197
+ en_out_channels=16,
198
+ ):
199
+ super(DeepUnet, self).__init__()
200
+ self.encoder = Encoder(
201
+ in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels
202
+ )
203
+ self.intermediate = Intermediate(
204
+ self.encoder.out_channel // 2,
205
+ self.encoder.out_channel,
206
+ inter_layers,
207
+ n_blocks,
208
+ )
209
+ self.decoder = Decoder(
210
+ self.encoder.out_channel, en_de_layers, kernel_size, n_blocks
211
+ )
212
+
213
+ def forward(self, x):
214
+ x, concat_tensors = self.encoder(x)
215
+ x = self.intermediate(x)
216
+ x = self.decoder(x, concat_tensors)
217
+ return x
218
+
219
+
220
+ class E2E(nn.Module):
221
+ def __init__(
222
+ self,
223
+ n_blocks,
224
+ n_gru,
225
+ kernel_size,
226
+ en_de_layers=5,
227
+ inter_layers=4,
228
+ in_channels=1,
229
+ en_out_channels=16,
230
+ ):
231
+ super(E2E, self).__init__()
232
+ self.unet = DeepUnet(
233
+ kernel_size,
234
+ n_blocks,
235
+ en_de_layers,
236
+ inter_layers,
237
+ in_channels,
238
+ en_out_channels,
239
+ )
240
+ self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
241
+ if n_gru:
242
+ self.fc = nn.Sequential(
243
+ BiGRU(3 * 128, 256, n_gru),
244
+ nn.Linear(512, 360),
245
+ nn.Dropout(0.25),
246
+ nn.Sigmoid(),
247
+ )
248
+ else:
249
+ self.fc = nn.Sequential(
250
+ nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid()
251
+ )
252
+
253
+ def forward(self, mel):
254
+ mel = mel.transpose(-1, -2).unsqueeze(1)
255
+ x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
256
+ x = self.fc(x)
257
+ return x
258
+
259
+
260
+ from librosa.filters import mel
261
+
262
+
263
+ class MelSpectrogram(torch.nn.Module):
264
+ def __init__(
265
+ self,
266
+ is_half,
267
+ n_mel_channels,
268
+ sampling_rate,
269
+ win_length,
270
+ hop_length,
271
+ n_fft=None,
272
+ mel_fmin=0,
273
+ mel_fmax=None,
274
+ clamp=1e-5,
275
+ ):
276
+ super().__init__()
277
+ n_fft = win_length if n_fft is None else n_fft
278
+ self.hann_window = {}
279
+ mel_basis = mel(
280
+ sr=sampling_rate,
281
+ n_fft=n_fft,
282
+ n_mels=n_mel_channels,
283
+ fmin=mel_fmin,
284
+ fmax=mel_fmax,
285
+ htk=True,
286
+ )
287
+ mel_basis = torch.from_numpy(mel_basis).float()
288
+ self.register_buffer("mel_basis", mel_basis)
289
+ self.n_fft = win_length if n_fft is None else n_fft
290
+ self.hop_length = hop_length
291
+ self.win_length = win_length
292
+ self.sampling_rate = sampling_rate
293
+ self.n_mel_channels = n_mel_channels
294
+ self.clamp = clamp
295
+ self.is_half = is_half
296
+
297
+ def forward(self, audio, keyshift=0, speed=1, center=True):
298
+ factor = 2 ** (keyshift / 12)
299
+ n_fft_new = int(np.round(self.n_fft * factor))
300
+ win_length_new = int(np.round(self.win_length * factor))
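+ # keyshift is in semitones: factor = 2 ** (keyshift / 12), so keyshift=12 doubles the
+ # FFT and window lengths (one octave up) and keyshift=-12 halves them.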
301
+ hop_length_new = int(np.round(self.hop_length * speed))
302
+ keyshift_key = str(keyshift) + "_" + str(audio.device)
303
+ if keyshift_key not in self.hann_window:
304
+ self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(
305
+ audio.device
306
+ )
307
+ fft = torch.stft(
308
+ audio,
309
+ n_fft=n_fft_new,
310
+ hop_length=hop_length_new,
311
+ win_length=win_length_new,
312
+ window=self.hann_window[keyshift_key],
313
+ center=center,
314
+ return_complex=True,
315
+ )
316
+ magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))
317
+ if keyshift != 0:
318
+ size = self.n_fft // 2 + 1
319
+ resize = magnitude.size(1)
320
+ if resize < size:
321
+ magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
322
+ magnitude = magnitude[:, :size, :] * self.win_length / win_length_new
323
+ mel_output = torch.matmul(self.mel_basis, magnitude)
324
+ if self.is_half == True:
325
+ mel_output = mel_output.half()
326
+ log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
327
+ return log_mel_spec
328
+
329
+
330
+ class RMVPE:
331
+ def __init__(self, model_path, is_half, device=None):
332
+ self.resample_kernel = {}
333
+ model = E2E(4, 1, (2, 2))
334
+ ckpt = torch.load(model_path, map_location="cpu")
335
+ model.load_state_dict(ckpt)
336
+ model.eval()
337
+ if is_half == True:
338
+ model = model.half()
339
+ self.model = model
340
+ self.resample_kernel = {}
341
+ self.is_half = is_half
342
+ if device is None:
343
+ device = "cuda" if torch.cuda.is_available() else "cpu"
344
+ self.device = device
345
+ self.mel_extractor = MelSpectrogram(
346
+ is_half, 128, 16000, 1024, 160, None, 30, 8000
347
+ ).to(device)
348
+ self.model = self.model.to(device)
349
+ cents_mapping = 20 * np.arange(360) + 1997.3794084376191
350
+ self.cents_mapping = np.pad(cents_mapping, (4, 4)) # 368
351
+
352
+ def mel2hidden(self, mel):
353
+ with torch.no_grad():
354
+ n_frames = mel.shape[-1]
355
+ mel = F.pad(
356
+ mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect"
357
+ )
358
+ hidden = self.model(mel)
359
+ return hidden[:, :n_frames]
360
+
361
+ def decode(self, hidden, thred=0.03):
362
+ cents_pred = self.to_local_average_cents(hidden, thred=thred)
363
+ f0 = 10 * (2 ** (cents_pred / 1200))
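+ # cents_mapping spans 360 bins of 20 cents each, so decoded pitches range from roughly
+ # 10 * 2**(1997.38 / 1200) ≈ 31.7 Hz up to about 2 kHz. Frames whose salience never
+ # exceeded the threshold have cents_pred == 0 and therefore decode to exactly 10 Hz,
+ # which the next line zeroes out as "unvoiced".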
364
+ f0[f0 == 10] = 0
365
+ # f0 = np.array([10 * (2 ** (cent_pred / 1200)) if cent_pred else 0 for cent_pred in cents_pred])
366
+ return f0
367
+
368
+ def infer_from_audio(self, audio, thred=0.03):
369
+ audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0)
370
+ # torch.cuda.synchronize()
371
+ # t0=ttime()
372
+ mel = self.mel_extractor(audio, center=True)
373
+ # torch.cuda.synchronize()
374
+ # t1=ttime()
375
+ hidden = self.mel2hidden(mel)
376
+ # torch.cuda.synchronize()
377
+ # t2=ttime()
378
+ hidden = hidden.squeeze(0).cpu().numpy()
379
+ if self.is_half == True:
380
+ hidden = hidden.astype("float32")
381
+ f0 = self.decode(hidden, thred=thred)
382
+ # torch.cuda.synchronize()
383
+ # t3=ttime()
384
+ # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0))
385
+ return f0
386
+
387
+ def to_local_average_cents(self, salience, thred=0.05):
388
+ # t0 = ttime()
389
+ center = np.argmax(salience, axis=1) # index of the peak bin per frame, shape (n_frames,)
+ salience = np.pad(salience, ((0, 0), (4, 4))) # shape (n_frames, 368)
391
+ # t1 = ttime()
392
+ center += 4
393
+ todo_salience = []
394
+ todo_cents_mapping = []
395
+ starts = center - 4
396
+ ends = center + 5
397
+ for idx in range(salience.shape[0]):
398
+ todo_salience.append(salience[:, starts[idx] : ends[idx]][idx])
399
+ todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]])
400
+ # t2 = ttime()
401
+ todo_salience = np.array(todo_salience) # (n_frames, 9)
+ todo_cents_mapping = np.array(todo_cents_mapping) # (n_frames, 9)
+ product_sum = np.sum(todo_salience * todo_cents_mapping, 1)
+ weight_sum = np.sum(todo_salience, 1) # (n_frames,)
+ divided = product_sum / weight_sum # weighted-average pitch in cents, (n_frames,)
+ # t3 = ttime()
+ maxx = np.max(salience, axis=1) # (n_frames,)
+ divided[maxx <= thred] = 0
+ # t4 = ttime()
+ # print("decode:%s\t%s\t%s\t%s" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
+ return divided
412
+
413
+
414
+ # if __name__ == '__main__':
415
+ # audio, sampling_rate = sf.read("卢本伟语录~1.wav")
416
+ # if len(audio.shape) > 1:
417
+ # audio = librosa.to_mono(audio.transpose(1, 0))
418
+ # audio_bak = audio.copy()
419
+ # if sampling_rate != 16000:
420
+ # audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
421
+ # model_path = "/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/test-RMVPE/weights/rmvpe_llc_half.pt"
422
+ # thred = 0.03 # 0.01
423
+ # device = 'cuda' if torch.cuda.is_available() else 'cpu'
424
+ # rmvpe = RMVPE(model_path,is_half=False, device=device)
425
+ # t0=ttime()
426
+ # f0 = rmvpe.infer_from_audio(audio, thred=thred)
427
+ # f0 = rmvpe.infer_from_audio(audio, thred=thred)
428
+ # f0 = rmvpe.infer_from_audio(audio, thred=thred)
429
+ # f0 = rmvpe.infer_from_audio(audio, thred=thred)
430
+ # f0 = rmvpe.infer_from_audio(audio, thred=thred)
431
+ # t1=ttime()
432
+ # print(f0.shape,t1-t0)