AICoverGen

Sleeping

App Files Files Community

Zeph27 committed on Aug 10

Commit

aef55a8

•

1 Parent(s): d060f6a

init

Browse files

Files changed (24) hide show

.gitignore +170 -0
app.py +302 -4
mdxnet_models/model_data.json +340 -0
requirements.txt +26 -0
src/configs/32k.json +46 -0
src/configs/32k_v2.json +46 -0
src/configs/40k.json +46 -0
src/configs/48k.json +46 -0
src/configs/48k_v2.json +46 -0
src/download_models.py +31 -0
src/infer_pack/attentions.py +417 -0
src/infer_pack/commons.py +166 -0
src/infer_pack/models.py +1124 -0
src/infer_pack/models_onnx.py +818 -0
src/infer_pack/models_onnx_moess.py +849 -0
src/infer_pack/modules.py +522 -0
src/infer_pack/transforms.py +209 -0
src/main.py +362 -0
src/mdx.py +327 -0
src/my_utils.py +21 -0
src/rmvpe.py +409 -0
src/rvc.py +151 -0
src/trainset_preprocess_pipeline_print.py +146 -0
src/vc_infer_pipeline.py +653 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,170 @@

+# MDX Models
+mdxnet_models/*.onnx
+# RVC Models
+rvc_models/*/*.pth
+rvc_models/*/*.index
+rvc_models/*/*.npy
+rvc_models/*/*.zip
+rvc_models/*/*.rar
+rvc_models/hubert_base.pt
+rvc_models/rmvpe.pt
+# Output
+song_output/*/*.wav
+song_output/*/*.mp3
+AICoverGen/
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/

app.py CHANGED Viewed

@@ -1,7 +1,305 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()

+import json
+import os
+import shutil
+import urllib.request
+import zipfile
+import gdown
+from argparse import ArgumentParser
 import gradio as gr
+from src.main import song_cover_pipeline
+# Parent of the directory that contains this file (two dirname() hops from the file's absolute path).
+# NOTE(review): not referenced anywhere in the visible part of this file - confirm it is still needed.
+BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+# The three names below are relative paths, so they resolve against the process working directory.
+mdxnet_models_dir = 'mdxnet_models'
+# Voice models are extracted into <rvc_models_dir>/<model name>.
+rvc_models_dir = 'rvc_models'
+# Scanned after each generation so that old result folders can be pruned.
+output_dir = 'song_output'
+def download_and_extract_model(model_url, model_name, progress=gr.Progress()):
+    try:
+        os.makedirs(rvc_models_dir, exist_ok=True)
+        extraction_folder = os.path.join(rvc_models_dir, model_name)
+        zip_path = os.path.join(rvc_models_dir, f'{model_name}.zip')
+        if os.path.exists(extraction_folder):
+            raise gr.Error(f'Voice model directory {model_name} already exists! Choose a different name for your voice model.')
+        progress(0, desc=f'[~] Downloading voice model with name {model_name}...')
+        try:
+            if 'huggingface.co' in model_url:
+                urllib.request.urlretrieve(model_url, zip_path)
+            elif 'pixeldrain.com' in model_url:
+                pixeldrain_id = model_url.split('/')[-1]
+                pixeldrain_url = f'https://pixeldrain.com/api/file/{pixeldrain_id}'
+                urllib.request.urlretrieve(pixeldrain_url, zip_path)
+            elif 'drive.google.com' in model_url:
+                file_id = model_url.split('/')[-2]
+                gdown.download(id=file_id, output=zip_path, quiet=False)
+            else:
+                urllib.request.urlretrieve(model_url, zip_path)
+        except Exception as download_error:
+            raise gr.Error(f"Failed to download the model: {str(download_error)}")
+        if not os.path.exists(zip_path):
+            raise gr.Error(f"Failed to download the model. The zip file was not created.")
+        progress(0.5, desc="Extracting model...")
+        extract_zip(extraction_folder, zip_path)
+        pth_files = [f for f in os.listdir(extraction_folder) if f.endswith('.pth')]
+        if not pth_files:
+            raise ValueError("No .pth file found in the downloaded model.")
+        progress(1, desc="Model ready")
+        return model_name
+    except Exception as e:
+        if os.path.exists(extraction_folder):
+            shutil.rmtree(extraction_folder)
+        if os.path.exists(zip_path):
+            os.remove(zip_path)
+        raise gr.Error(f"Error downloading or extracting model: {str(e)}")
+def cleanup_temp_model(model_name):
+    temp_dir = os.path.join(rvc_models_dir, model_name)
+    try:
+        shutil.rmtree(temp_dir)
+    except Exception as e:
+        print(f"Error cleaning up temporary model files: {str(e)}")
+def extract_zip(extraction_folder, zip_name):
+    os.makedirs(extraction_folder)
+    with zipfile.ZipFile(zip_name, 'r') as zip_ref:
+        zip_ref.extractall(extraction_folder)
+    os.remove(zip_name)
+    index_filepath, model_filepath = None, None
+    for root, dirs, files in os.walk(extraction_folder):
+        for name in files:
+            if name.endswith('.index') and os.stat(os.path.join(root, name)).st_size > 1024 * 100:
+                index_filepath = os.path.join(root, name)
+            if name.endswith('.pth') and os.stat(os.path.join(root, name)).st_size > 1024 * 1024 * 40:
+                model_filepath = os.path.join(root, name)
+    if not model_filepath:
+        raise gr.Error(f'No .pth model file was found in the extracted zip. Please check {extraction_folder}.')
+    # move model and index file to extraction folder
+    os.rename(model_filepath, os.path.join(extraction_folder, os.path.basename(model_filepath)))
+    if index_filepath:
+        os.rename(index_filepath, os.path.join(extraction_folder, os.path.basename(index_filepath)))
+    # remove any unnecessary nested folders
+    for filepath in os.listdir(extraction_folder):
+        if os.path.isdir(os.path.join(extraction_folder, filepath)):
+            shutil.rmtree(os.path.join(extraction_folder, filepath))
+def download_online_model(url, dir_name, progress=gr.Progress()):
+    try:
+        progress(0, desc=f'[~] Downloading voice model with name {dir_name}...')
+        zip_name = url.split('/')[-1]
+        extraction_folder = os.path.join(rvc_models_dir, dir_name)
+        if os.path.exists(extraction_folder):
+            raise gr.Error(f'Voice model directory {dir_name} already exists! Choose a different name for your voice model.')
+        if 'huggingface.co' in url:
+            urllib.request.urlretrieve(url, zip_name)
+        if 'pixeldrain.com' in url:
+            zip_name = dir_name + '.zip'
+            url = f'https://pixeldrain.com/api/file/{zip_name}'
+            urllib.request.urlretrieve(url, zip_name)
+        elif 'drive.google.com' in url:
+            # Extract the Google Drive file ID
+            zip_name = dir_name + '.zip'
+            file_id = url.split('/')[-2]
+            output = os.path.join('.', f'{dir_name}.zip')  # Adjust the output path if needed
+            gdown.download(id=file_id, output=output, quiet=False)
+        progress(0.5, desc='[~] Extracting zip...')
+        extract_zip(extraction_folder, zip_name)
+        return f'[+] {dir_name} Model successfully downloaded!'
+    except Exception as e:
+        raise gr.Error(str(e))
+def upload_local_model(zip_path, dir_name, progress=gr.Progress()):
+    try:
+        extraction_folder = os.path.join(rvc_models_dir, dir_name)
+        if os.path.exists(extraction_folder):
+            raise gr.Error(f'Voice model directory {dir_name} already exists! Choose a different name for your voice model.')
+        zip_name = zip_path.name
+        progress(0.5, desc='[~] Extracting zip...')
+        extract_zip(extraction_folder, zip_name)
+        return f'[+] {dir_name} Model successfully uploaded!'
+    except Exception as e:
+        raise gr.Error(str(e))
+def pub_dl_autofill(pub_models, event: gr.SelectData):
+    return gr.Text.update(value=pub_models.loc[event.index[0], 'URL']), gr.Text.update(value=pub_models.loc[event.index[0], 'Model Name'])
+def swap_visibility():
+    return gr.update(visible=True), gr.update(visible=False), gr.update(value=''), gr.update(value=None)
+def process_file_upload(file):
+    return file.name, gr.update(value=file.name)
+def show_hop_slider(pitch_detection_algo):
+    if pitch_detection_algo == 'mangio-crepe':
+        return gr.update(visible=True)
+    else:
+        return gr.update(visible=False)
+def song_cover_pipeline_with_model_download(song_input, model_url, model_name, pitch, keep_files, is_webui, main_gain, backup_gain,
+                                            inst_gain, index_rate, filter_radius, rms_mix_rate, f0_method, crepe_hop_length,
+                                            protect, pitch_all, reverb_rm_size, reverb_wet, reverb_dry, reverb_damping,
+                                            output_format, progress=gr.Progress()):
+    model_path = None
+    try:
+        model_path = download_and_extract_model(model_url, model_name, progress)
+        print(f"Model path: {model_path}")
+        result = song_cover_pipeline(song_input, model_path, pitch, keep_files, is_webui, main_gain, backup_gain,
+                                     inst_gain, index_rate, filter_radius, rms_mix_rate, f0_method, crepe_hop_length,
+                                     protect, pitch_all, reverb_rm_size, reverb_wet, reverb_dry, reverb_damping,
+                                     output_format, progress)
+        # Clean up old folders in song_output
+        output_folders = [f for f in os.listdir(output_dir) if os.path.isdir(os.path.join(output_dir, f))]
+        output_folders.sort(key=lambda x: os.path.getmtime(os.path.join(output_dir, x)))
+        while len(output_folders) > 100:
+            oldest_folder = output_folders.pop(0)
+            shutil.rmtree(os.path.join(output_dir, oldest_folder))
+        return result
+    except gr.Error as e:
+        return str(e), None  # Return error message and None for the second output
+    finally:
+        if model_path:
+            cleanup_temp_model(model_path)
+# Script entry point: parse the command-line flags, build the Gradio Blocks UI and launch the server.
+if __name__ == '__main__':
+    parser = ArgumentParser(description='Generate a AI cover song in the song_output/id directory.', add_help=True)
+    parser.add_argument("--share", action="store_true", dest="share_enabled", default=False, help="Enable sharing")
+    parser.add_argument("--listen", action="store_true", default=False, help="Make the WebUI reachable from your local network.")
+    parser.add_argument('--listen-host', type=str, help='The hostname that the server will use.')
+    parser.add_argument('--listen-port', type=int, help='The listening port that the server will use.')
+    args = parser.parse_args()
+    # NOTE(review): '[email protected]' in the theme id looks like an e-mail-obfuscation artifact
+    # of the page this file was captured from (presumably 'NoCrypt/miku@<version>') -
+    # confirm the real theme id against the repository.
+    with gr.Blocks(title='AICoverGenWebUI', theme='NoCrypt/[email protected]') as app:
+        gr.Label('AICoverGen WebUI created with ❤️', show_label=False)
+        # main tab
+        with gr.Tab("Generate"):
+            with gr.Accordion('Main Options'):
+                with gr.Row():
+                    with gr.Column():
+                        model_url = gr.Text(label='Voice Model URL', info='Enter the URL of the voice model zip file', value='https://huggingface.co/megaaziib/my-rvc-models-collection/resolve/main/kobo.zip')
+                        model_name = gr.Text(label='Voice Model Name', info='Enter the name of the voice model', value='kobo')
+                        # rvc_model = gr.Dropdown(voice_models, label='Voice Models', info='Models folder "AICoverGen --> rvc_models". After new models are added into this folder, click the refresh button')
+                    # Two alternative input columns: a link/path text box (shown first) or a file upload (hidden first).
+                    with gr.Column() as yt_link_col:
+                        song_input = gr.Text(label='Song input', info='Link to a song on YouTube or full path to a local file. For file upload, click the button below.', value='https://youtu.be/FRh7LvlQTuA')
+                        show_file_upload_button = gr.Button('Upload file instead')
+                    with gr.Column(visible=False) as file_upload_col:
+                        local_file = gr.File(label='Audio file')
+                        song_input_file = gr.UploadButton('Upload 📂', file_types=['audio'], variant='primary')
+                        show_yt_link_button = gr.Button('Paste YouTube link/Path to local file instead')
+                        # An upload fills both the file display and the (hidden) song_input text box with the file path.
+                        song_input_file.upload(process_file_upload, inputs=[song_input_file], outputs=[local_file, song_input])
+                    with gr.Column():
+                        pitch = gr.Slider(-24, 24, value=0, step=1, label='Pitch Change (Vocals ONLY)', info='Generally, use 12 for male to female conversions and -12 for vice-versa. (Octaves)')
+                        pitch_all = gr.Slider(-12, 12, value=0, step=1, label='Overall Pitch Change', info='Changes pitch/key of vocals and instrumentals together. Altering this slightly reduces sound quality. (Semitones)')
+                    # Same handler for both directions: the first output is shown, the second hidden,
+                    # and song_input/local_file are reset, so only the order of the two columns differs.
+                    show_file_upload_button.click(swap_visibility, outputs=[file_upload_col, yt_link_col, song_input, local_file])
+                    show_yt_link_button.click(swap_visibility, outputs=[yt_link_col, file_upload_col, song_input, local_file])
+            with gr.Accordion('Voice conversion options', open=False):
+                with gr.Row():
+                    index_rate = gr.Slider(0, 1, value=0.5, label='Index Rate', info="Controls how much of the AI voice's accent to keep in the vocals")
+                    filter_radius = gr.Slider(0, 7, value=3, step=1, label='Filter radius', info='If >=3: apply median filtering median filtering to the harvested pitch results. Can reduce breathiness')
+                    rms_mix_rate = gr.Slider(0, 1, value=0.25, label='RMS mix rate', info="Control how much to mimic the original vocal's loudness (0) or a fixed loudness (1)")
+                    protect = gr.Slider(0, 0.5, value=0.33, label='Protect rate', info='Protect voiceless consonants and breath sounds. Set to 0.5 to disable.')
+                    with gr.Column():
+                        f0_method = gr.Dropdown(['rmvpe', 'mangio-crepe'], value='rmvpe', label='Pitch detection algorithm', info='Best option is rmvpe (clarity in vocals), then mangio-crepe (smoother vocals)')
+                        # Hidden until 'mangio-crepe' is selected (see show_hop_slider).
+                        crepe_hop_length = gr.Slider(32, 320, value=128, step=1, visible=False, label='Crepe hop length', info='Lower values leads to longer conversions and higher risk of voice cracks, but better pitch accuracy.')
+                        f0_method.change(show_hop_slider, inputs=f0_method, outputs=crepe_hop_length)
+                keep_files = gr.Checkbox(label='Keep intermediate files', info='Keep all audio files generated in the song_output/id directory, e.g. Isolated Vocals/Instrumentals. Leave unchecked to save space')
+            with gr.Accordion('Audio mixing options', open=False):
+                gr.Markdown('### Volume Change (decibels)')
+                with gr.Row():
+                    main_gain = gr.Slider(-20, 20, value=0, step=1, label='Main Vocals')
+                    backup_gain = gr.Slider(-20, 20, value=0, step=1, label='Backup Vocals')
+                    inst_gain = gr.Slider(-20, 20, value=0, step=1, label='Music')
+                gr.Markdown('### Reverb Control on AI Vocals')
+                with gr.Row():
+                    reverb_rm_size = gr.Slider(0, 1, value=0.15, label='Room size', info='The larger the room, the longer the reverb time')
+                    reverb_wet = gr.Slider(0, 1, value=0.2, label='Wetness level', info='Level of AI vocals with reverb')
+                    reverb_dry = gr.Slider(0, 1, value=0.8, label='Dryness level', info='Level of AI vocals without reverb')
+                    reverb_damping = gr.Slider(0, 1, value=0.7, label='Damping level', info='Absorption of high frequencies in the reverb')
+                gr.Markdown('### Audio Output Format')
+                output_format = gr.Dropdown(['mp3', 'wav'], value='mp3', label='Output file type', info='mp3: small file size, decent quality. wav: Large file size, best quality')
+            with gr.Row():
+                clear_btn = gr.ClearButton(value='Clear', components=[song_input, model_url, keep_files, local_file])
+                generate_btn = gr.Button("Generate", variant='primary')
+            with gr.Row():
+                ai_cover = gr.Audio(label='AI Cover (Vocal Only Inference)', show_share_button=False)
+                ai_backing = gr.Audio(label='AI Cover (Vocal Backing Inference)', show_share_button=False)
+            # Invisible constant input: always passes is_webui=1 to the pipeline.
+            is_webui = gr.Number(value=1, visible=False)
+            # The inputs list must stay in the positional order of song_cover_pipeline_with_model_download.
+            generate_btn.click(song_cover_pipeline_with_model_download,
+                inputs=[song_input, model_url, model_name, pitch, keep_files, is_webui, main_gain, backup_gain,
+                        inst_gain, index_rate, filter_radius, rms_mix_rate, f0_method, crepe_hop_length,
+                        protect, pitch_all, reverb_rm_size, reverb_wet, reverb_dry, reverb_damping,
+                        output_format],
+                outputs=[ai_cover, ai_backing])
+            # In addition to the components cleared by the ClearButton itself, reset every option to the
+            # default it was created with above; the value list is positional and must match `outputs`.
+            # NOTE(review): ai_backing is not part of the outputs, so it keeps its last value - confirm intended.
+            clear_btn.click(lambda: [0, 0, 0, 0, 0.5, 3, 0.25, 0.33, 'rmvpe', 128, 0, 0.15, 0.2, 0.8, 0.7, 'mp3', None],
+                outputs=[pitch, main_gain, backup_gain, inst_gain, index_rate, filter_radius, rms_mix_rate,
+                            protect, f0_method, crepe_hop_length, pitch_all, reverb_rm_size, reverb_wet,
+                            reverb_dry, reverb_damping, output_format, ai_cover])
+        # Upload tab
+        with gr.Tab('Upload model'):
+            gr.Markdown('## Upload locally trained RVC v2 model and index file')
+            gr.Markdown('- Find model file (weights folder) and optional index file (logs/[name] folder)')
+            gr.Markdown('- Compress files into zip file')
+            gr.Markdown('- Upload zip file and give unique name for voice')
+            gr.Markdown('- Click Upload model')
+            with gr.Row():
+                with gr.Column():
+                    zip_file = gr.File(label='Zip file')
+                local_model_name = gr.Text(label='Model name')
+            with gr.Row():
+                model_upload_button = gr.Button('Upload model', variant='primary', scale=19)
+                local_upload_output_message = gr.Text(label='Output Message', interactive=False, scale=20)
+                model_upload_button.click(upload_local_model, inputs=[zip_file, local_model_name], outputs=local_upload_output_message)
+    # server_name stays None unless --listen is given; with --listen it is --listen-host or, failing that, '0.0.0.0'.
+    app.launch(
+        share=args.share_enabled,
+        server_name=None if not args.listen else (args.listen_host or '0.0.0.0'),
+        server_port=args.listen_port,
+    )

mdxnet_models/model_data.json ADDED Viewed

	@@ -0,0 +1,340 @@

+{
+    "0ddfc0eb5792638ad5dc27850236c246": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Vocals"
+    },
+    "26d308f91f3423a67dc69a6d12a8793d": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 9,
+        "mdx_n_fft_scale_set": 8192,
+        "primary_stem": "Other"
+    },
+    "2cdd429caac38f0194b133884160f2c6": {
+        "compensate": 1.045,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Instrumental"
+    },
+    "2f5501189a2f6db6349916fabe8c90de": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Vocals"
+    },
+    "398580b6d5d973af3120df54cee6759d": {
+        "compensate": 1.75,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Vocals"
+    },
+    "488b3e6f8bd3717d9d7c428476be2d75": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Instrumental"
+    },
+    "4910e7827f335048bdac11fa967772f9": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 7,
+        "mdx_n_fft_scale_set": 4096,
+        "primary_stem": "Drums"
+    },
+    "53c4baf4d12c3e6c3831bb8f5b532b93": {
+        "compensate": 1.043,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Vocals"
+    },
+    "5d343409ef0df48c7d78cce9f0106781": {
+        "compensate": 1.075,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Vocals"
+    },
+    "5f6483271e1efb9bfb59e4a3e6d4d098": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 9,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Vocals"
+    },
+    "65ab5919372a128e4167f5e01a8fda85": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 8192,
+        "primary_stem": "Other"
+    },
+    "6703e39f36f18aa7855ee1047765621d": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 9,
+        "mdx_n_fft_scale_set": 16384,
+        "primary_stem": "Bass"
+    },
+    "6b31de20e84392859a3d09d43f089515": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Vocals"
+    },
+    "867595e9de46f6ab699008295df62798": {
+        "compensate": 1.03,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Vocals"
+    },
+    "a3cd63058945e777505c01d2507daf37": {
+        "compensate": 1.03,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Vocals"
+    },
+    "b33d9b3950b6cbf5fe90a32608924700": {
+        "compensate": 1.03,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Vocals"
+    },
+    "c3b29bdce8c4fa17ec609e16220330ab": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 16384,
+        "primary_stem": "Bass"
+    },
+    "ceed671467c1f64ebdfac8a2490d0d52": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Instrumental"
+    },
+    "d2a1376f310e4f7fa37fb9b5774eb701": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Instrumental"
+    },
+    "d7bff498db9324db933d913388cba6be": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Vocals"
+    },
+    "d94058f8c7f1fae4164868ae8ae66b20": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Vocals"
+    },
+    "dc41ede5961d50f277eb846db17f5319": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 9,
+        "mdx_n_fft_scale_set": 4096,
+        "primary_stem": "Drums"
+    },
+    "e5572e58abf111f80d8241d2e44e7fa4": {
+        "compensate": 1.028,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Instrumental"
+    },
+    "e7324c873b1f615c35c1967f912db92a": {
+        "compensate": 1.03,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Vocals"
+    },
+    "1c56ec0224f1d559c42fd6fd2a67b154": {
+        "compensate": 1.025,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 5120,
+        "primary_stem": "Instrumental"
+    },
+    "f2df6d6863d8f435436d8b561594ff49": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Instrumental"
+    },
+    "b06327a00d5e5fbc7d96e1781bbdb596": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Instrumental"
+    },
+    "94ff780b977d3ca07c7a343dab2e25dd": {
+        "compensate": 1.039,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Instrumental"
+    },
+    "73492b58195c3b52d34590d5474452f6": {
+        "compensate": 1.043,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Vocals"
+    },
+    "970b3f9492014d18fefeedfe4773cb42": {
+        "compensate": 1.009,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Vocals"
+    },
+    "1d64a6d2c30f709b8c9b4ce1366d96ee": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 5120,
+        "primary_stem": "Instrumental"
+    },
+    "203f2a3955221b64df85a41af87cf8f0": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Instrumental"
+    },
+    "291c2049608edb52648b96e27eb80e95": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Instrumental"
+    },
+    "ead8d05dab12ec571d67549b3aab03fc": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Instrumental"
+    },
+    "cc63408db3d80b4d85b0287d1d7c9632": {
+        "compensate": 1.033,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Instrumental"
+    },
+    "cd5b2989ad863f116c855db1dfe24e39": {
+        "compensate": 1.035,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 9,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Other"
+    },
+    "55657dd70583b0fedfba5f67df11d711": {
+        "compensate": 1.022,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 6144,
+        "primary_stem": "Instrumental"
+    },
+    "b6bccda408a436db8500083ef3491e8b": {
+        "compensate": 1.02,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Instrumental"
+    },
+    "8a88db95c7fb5dbe6a095ff2ffb428b1": {
+        "compensate": 1.026,
+        "mdx_dim_f_set": 2048,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 5120,
+        "primary_stem": "Instrumental"
+    },
+    "b78da4afc6512f98e4756f5977f5c6b9": {
+        "compensate": 1.021,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Instrumental"
+    },
+    "77d07b2667ddf05b9e3175941b4454a0": {
+        "compensate": 1.021,
+        "mdx_dim_f_set": 3072,
+        "mdx_dim_t_set": 8,
+        "mdx_n_fft_scale_set": 7680,
+        "primary_stem": "Vocals"
+    },
+    "2154254ee89b2945b97a7efed6e88820": {
+        "config_yaml": "model_2_stem_061321.yaml"
+    },
+    "063aadd735d58150722926dcbf5852a9": {
+        "config_yaml": "model_2_stem_061321.yaml"
+    },
+    "fe96801369f6a148df2720f5ced88c19": {
+        "config_yaml": "model3.yaml"
+    },
+    "02e8b226f85fb566e5db894b9931c640": {
+        "config_yaml": "model2.yaml"
+    },
+    "e3de6d861635ab9c1d766149edd680d6": {
+        "config_yaml": "model1.yaml"
+    },
+    "3f2936c554ab73ce2e396d54636bd373": {
+        "config_yaml": "modelB.yaml"
+    },
+    "890d0f6f82d7574bca741a9e8bcb8168": {
+        "config_yaml": "modelB.yaml"
+    },
+    "63a3cb8c37c474681049be4ad1ba8815": {
+        "config_yaml": "modelB.yaml"
+    },
+    "a7fc5d719743c7fd6b61bd2b4d48b9f0": {
+        "config_yaml": "modelA.yaml"
+    },
+    "3567f3dee6e77bf366fcb1c7b8bc3745": {
+        "config_yaml": "modelA.yaml"
+    },
+    "a28f4d717bd0d34cd2ff7a3b0a3d065e": {
+        "config_yaml": "modelA.yaml"
+    },
+    "c9971a18da20911822593dc81caa8be9": {
+        "config_yaml": "sndfx.yaml"
+    },
+    "57d94d5ed705460d21c75a5ac829a605": {
+        "config_yaml": "sndfx.yaml"
+    },
+    "e7a25f8764f25a52c1b96c4946e66ba2": {
+        "config_yaml": "sndfx.yaml"
+    },
+    "104081d24e37217086ce5fde09147ee1": {
+        "config_yaml": "model_2_stem_061321.yaml"
+    },
+    "1e6165b601539f38d0a9330f3facffeb": {
+        "config_yaml": "model_2_stem_061321.yaml"
+    },
+    "fe0108464ce0d8271be5ab810891bd7c": {
+        "config_yaml": "model_2_stem_full_band.yaml"
+    }
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,26 @@

+deemix
+fairseq==0.12.2
+faiss-cpu==1.7.3
+ffmpeg-python>=0.2.0
+gradio==4.29.0
+lib==4.0.0
+librosa==0.10
+numpy==1.23.5
+onnxruntime_gpu
+praat-parselmouth>=0.4.2
+pedalboard==0.7.7
+pydub==0.25.1
+pyworld==0.3.4
+Requests==2.31.0
+scipy>=1.13.0,<2.0.0
+soundfile==0.12.1
+--extra-index-url https://download.pytorch.org/whl/cu118
+torch==2.0.1
+torchvision==0.15.2
+torchaudio==2.0.2
+torchcrepe==0.0.23
+tqdm==4.65.0
+yt_dlp==2023.7.6
+sox==1.4.1
+audio-separator[gpu]==0.17.5
+gdown==5.2.0

src/configs/32k.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "train": {
+    "log_interval": 200,
+    "seed": 1234,
+    "epochs": 20000,
+    "learning_rate": 1e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "batch_size": 4,
+    "fp16_run": false,
+    "lr_decay": 0.999875,
+    "segment_size": 12800,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0
+  },
+  "data": {
+    "max_wav_value": 32768.0,
+    "sampling_rate": 32000,
+    "filter_length": 1024,
+    "hop_length": 320,
+    "win_length": 1024,
+    "n_mel_channels": 80,
+    "mel_fmin": 0.0,
+    "mel_fmax": null
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [10,4,2,2,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [16,16,4,4,4],
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "spk_embed_dim": 109
+  }
+}

src/configs/32k_v2.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "train": {
+    "log_interval": 200,
+    "seed": 1234,
+    "epochs": 20000,
+    "learning_rate": 1e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "batch_size": 4,
+    "fp16_run": true,
+    "lr_decay": 0.999875,
+    "segment_size": 12800,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0
+  },
+  "data": {
+    "max_wav_value": 32768.0,
+    "sampling_rate": 32000,
+    "filter_length": 1024,
+    "hop_length": 320,
+    "win_length": 1024,
+    "n_mel_channels": 80,
+    "mel_fmin": 0.0,
+    "mel_fmax": null
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [10,8,2,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [20,16,4,4],
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "spk_embed_dim": 109
+  }
+}

src/configs/40k.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "train": {
+    "log_interval": 200,
+    "seed": 1234,
+    "epochs": 20000,
+    "learning_rate": 1e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "batch_size": 4,
+    "fp16_run": false,
+    "lr_decay": 0.999875,
+    "segment_size": 12800,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0
+  },
+  "data": {
+    "max_wav_value": 32768.0,
+    "sampling_rate": 40000,
+    "filter_length": 2048,
+    "hop_length": 400,
+    "win_length": 2048,
+    "n_mel_channels": 125,
+    "mel_fmin": 0.0,
+    "mel_fmax": null
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [10,10,2,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [16,16,4,4],
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "spk_embed_dim": 109
+  }
+}

src/configs/48k.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "train": {
+    "log_interval": 200,
+    "seed": 1234,
+    "epochs": 20000,
+    "learning_rate": 1e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "batch_size": 4,
+    "fp16_run": false,
+    "lr_decay": 0.999875,
+    "segment_size": 11520,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0
+  },
+  "data": {
+    "max_wav_value": 32768.0,
+    "sampling_rate": 48000,
+    "filter_length": 2048,
+    "hop_length": 480,
+    "win_length": 2048,
+    "n_mel_channels": 128,
+    "mel_fmin": 0.0,
+    "mel_fmax": null
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [10,6,2,2,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [16,16,4,4,4],
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "spk_embed_dim": 109
+  }
+}

src/configs/48k_v2.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "train": {
+    "log_interval": 200,
+    "seed": 1234,
+    "epochs": 20000,
+    "learning_rate": 1e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "batch_size": 4,
+    "fp16_run": true,
+    "lr_decay": 0.999875,
+    "segment_size": 17280,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0
+  },
+  "data": {
+    "max_wav_value": 32768.0,
+    "sampling_rate": 48000,
+    "filter_length": 2048,
+    "hop_length": 480,
+    "win_length": 2048,
+    "n_mel_channels": 128,
+    "mel_fmin": 0.0,
+    "mel_fmax": null
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [12,10,2,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [24,20,4,4],
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "spk_embed_dim": 109
+  }
+}

src/download_models.py ADDED Viewed

	@@ -0,0 +1,31 @@

+from pathlib import Path
+import requests
+MDX_DOWNLOAD_LINK = 'https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/'
+RVC_DOWNLOAD_LINK = 'https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/'
+BASE_DIR = Path(__file__).resolve().parent.parent
+mdxnet_models_dir = BASE_DIR / 'mdxnet_models'
+rvc_models_dir = BASE_DIR / 'rvc_models'
+def dl_model(link, model_name, dir_name):
+    with requests.get(f'{link}{model_name}') as r:
+        r.raise_for_status()
+        with open(dir_name / model_name, 'wb') as f:
+            for chunk in r.iter_content(chunk_size=8192):
+                f.write(chunk)
+if __name__ == '__main__':
+    mdx_model_names = ['UVR-MDX-NET-Voc_FT.onnx', 'UVR_MDXNET_KARA_2.onnx', 'Reverb_HQ_By_FoxJoy.onnx', 'Kim_Vocal_2.onnx']
+    for model in mdx_model_names:
+        print(f'Downloading {model}...')
+        dl_model(MDX_DOWNLOAD_LINK, model, mdxnet_models_dir)
+    rvc_model_names = ['hubert_base.pt', 'rmvpe.pt']
+    for model in rvc_model_names:
+        print(f'Downloading {model}...')
+        dl_model(RVC_DOWNLOAD_LINK, model, rvc_models_dir)
+    print('All models downloaded!')

src/infer_pack/attentions.py ADDED Viewed

	@@ -0,0 +1,417 @@

+import copy
+import math
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import functional as F
+from src.infer_pack import commons
+from src.infer_pack import modules
+from src.infer_pack.modules import LayerNorm
+class Encoder(nn.Module):
+    def __init__(
+        self,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size=1,
+        p_dropout=0.0,
+        window_size=10,
+        **kwargs
+    ):
+        super().__init__()
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.window_size = window_size
+        self.drop = nn.Dropout(p_dropout)
+        self.attn_layers = nn.ModuleList()
+        self.norm_layers_1 = nn.ModuleList()
+        self.ffn_layers = nn.ModuleList()
+        self.norm_layers_2 = nn.ModuleList()
+        for i in range(self.n_layers):
+            self.attn_layers.append(
+                MultiHeadAttention(
+                    hidden_channels,
+                    hidden_channels,
+                    n_heads,
+                    p_dropout=p_dropout,
+                    window_size=window_size,
+                )
+            )
+            self.norm_layers_1.append(LayerNorm(hidden_channels))
+            self.ffn_layers.append(
+                FFN(
+                    hidden_channels,
+                    hidden_channels,
+                    filter_channels,
+                    kernel_size,
+                    p_dropout=p_dropout,
+                )
+            )
+            self.norm_layers_2.append(LayerNorm(hidden_channels))
+    def forward(self, x, x_mask):
+        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
+        x = x * x_mask
+        for i in range(self.n_layers):
+            y = self.attn_layers[i](x, x, attn_mask)
+            y = self.drop(y)
+            x = self.norm_layers_1[i](x + y)
+            y = self.ffn_layers[i](x, x_mask)
+            y = self.drop(y)
+            x = self.norm_layers_2[i](x + y)
+        x = x * x_mask
+        return x
+class Decoder(nn.Module):
+    def __init__(
+        self,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size=1,
+        p_dropout=0.0,
+        proximal_bias=False,
+        proximal_init=True,
+        **kwargs
+    ):
+        super().__init__()
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.proximal_bias = proximal_bias
+        self.proximal_init = proximal_init
+        self.drop = nn.Dropout(p_dropout)
+        self.self_attn_layers = nn.ModuleList()
+        self.norm_layers_0 = nn.ModuleList()
+        self.encdec_attn_layers = nn.ModuleList()
+        self.norm_layers_1 = nn.ModuleList()
+        self.ffn_layers = nn.ModuleList()
+        self.norm_layers_2 = nn.ModuleList()
+        for i in range(self.n_layers):
+            self.self_attn_layers.append(
+                MultiHeadAttention(
+                    hidden_channels,
+                    hidden_channels,
+                    n_heads,
+                    p_dropout=p_dropout,
+                    proximal_bias=proximal_bias,
+                    proximal_init=proximal_init,
+                )
+            )
+            self.norm_layers_0.append(LayerNorm(hidden_channels))
+            self.encdec_attn_layers.append(
+                MultiHeadAttention(
+                    hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
+                )
+            )
+            self.norm_layers_1.append(LayerNorm(hidden_channels))
+            self.ffn_layers.append(
+                FFN(
+                    hidden_channels,
+                    hidden_channels,
+                    filter_channels,
+                    kernel_size,
+                    p_dropout=p_dropout,
+                    causal=True,
+                )
+            )
+            self.norm_layers_2.append(LayerNorm(hidden_channels))
+    def forward(self, x, x_mask, h, h_mask):
+        """
+        x: decoder input
+        h: encoder output
+        """
+        self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(
+            device=x.device, dtype=x.dtype
+        )
+        encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
+        x = x * x_mask
+        for i in range(self.n_layers):
+            y = self.self_attn_layers[i](x, x, self_attn_mask)
+            y = self.drop(y)
+            x = self.norm_layers_0[i](x + y)
+            y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
+            y = self.drop(y)
+            x = self.norm_layers_1[i](x + y)
+            y = self.ffn_layers[i](x, x_mask)
+            y = self.drop(y)
+            x = self.norm_layers_2[i](x + y)
+        x = x * x_mask
+        return x
+class MultiHeadAttention(nn.Module):
+    """Multi-head attention over [b, channels, t] tensors with 1x1 Conv1d projections.
+    Optional behaviour chosen at construction time: learned relative-position
+    embeddings (``window_size``), a proximal bias added to the scores
+    (``proximal_bias``), a banded mask of half-width ``block_length``, and
+    initialising the key projection as a copy of the query projection
+    (``proximal_init``).
+    """
+    def __init__(
+        self,
+        channels,
+        out_channels,
+        n_heads,
+        p_dropout=0.0,
+        window_size=None,
+        heads_share=True,
+        block_length=None,
+        proximal_bias=False,
+        proximal_init=False,
+    ):
+        super().__init__()
+        assert channels % n_heads == 0
+        self.channels = channels
+        self.out_channels = out_channels
+        self.n_heads = n_heads
+        self.p_dropout = p_dropout
+        self.window_size = window_size
+        self.heads_share = heads_share
+        self.block_length = block_length
+        self.proximal_bias = proximal_bias
+        self.proximal_init = proximal_init
+        # Attention weights of the most recent forward() call.
+        self.attn = None
+        # Channels handled by each head.
+        self.k_channels = channels // n_heads
+        self.conv_q = nn.Conv1d(channels, channels, 1)
+        self.conv_k = nn.Conv1d(channels, channels, 1)
+        self.conv_v = nn.Conv1d(channels, channels, 1)
+        self.conv_o = nn.Conv1d(channels, out_channels, 1)
+        self.drop = nn.Dropout(p_dropout)
+        if window_size is not None:
+            # One embedding per relative offset in [-window_size, window_size];
+            # a single set is shared by all heads when heads_share is true.
+            n_heads_rel = 1 if heads_share else n_heads
+            rel_stddev = self.k_channels**-0.5
+            self.emb_rel_k = nn.Parameter(
+                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
+                * rel_stddev
+            )
+            self.emb_rel_v = nn.Parameter(
+                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
+                * rel_stddev
+            )
+        # Only the q/k/v projections are re-initialised; conv_o keeps its default init.
+        nn.init.xavier_uniform_(self.conv_q.weight)
+        nn.init.xavier_uniform_(self.conv_k.weight)
+        nn.init.xavier_uniform_(self.conv_v.weight)
+        if proximal_init:
+            # Start with identical query and key projections.
+            with torch.no_grad():
+                self.conv_k.weight.copy_(self.conv_q.weight)
+                self.conv_k.bias.copy_(self.conv_q.bias)
+    def forward(self, x, c, attn_mask=None):
+        """Attend from ``x`` (queries) to ``c`` (keys and values).
+        The attention weights are stored on ``self.attn`` as a side effect.
+        """
+        q = self.conv_q(x)
+        k = self.conv_k(c)
+        v = self.conv_v(c)
+        x, self.attn = self.attention(q, k, v, mask=attn_mask)
+        x = self.conv_o(x)
+        return x
+    def attention(self, query, key, value, mask=None):
+        """Scaled dot-product attention; returns (output [b, d, t_t], weights [b, n_h, t_t, t_s])."""
+        # reshape [b, d, t] -> [b, n_h, t, d_k]
+        # t_s is the key/value length, t_t the query length.
+        b, d, t_s, t_t = (*key.size(), query.size(2))
+        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
+        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+        scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
+        if self.window_size is not None:
+            assert (
+                t_s == t_t
+            ), "Relative attention is only available for self-attention."
+            # Add the logits contributed by the relative-position key embeddings.
+            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
+            rel_logits = self._matmul_with_relative_keys(
+                query / math.sqrt(self.k_channels), key_relative_embeddings
+            )
+            scores_local = self._relative_position_to_absolute_position(rel_logits)
+            scores = scores + scores_local
+        if self.proximal_bias:
+            assert t_s == t_t, "Proximal bias is only available for self-attention."
+            scores = scores + self._attention_bias_proximal(t_s).to(
+                device=scores.device, dtype=scores.dtype
+            )
+        if mask is not None:
+            # Masked positions get a large negative score instead of -inf.
+            scores = scores.masked_fill(mask == 0, -1e4)
+            # NOTE(review): the block_length band is nested under `mask is not None`,
+            # so it is skipped when no mask is passed - confirm this is intended.
+            if self.block_length is not None:
+                assert (
+                    t_s == t_t
+                ), "Local attention is only available for self-attention."
+                block_mask = (
+                    torch.ones_like(scores)
+                    .triu(-self.block_length)
+                    .tril(self.block_length)
+                )
+                scores = scores.masked_fill(block_mask == 0, -1e4)
+        p_attn = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
+        p_attn = self.drop(p_attn)
+        output = torch.matmul(p_attn, value)
+        if self.window_size is not None:
+            # Add the value contribution of the relative-position embeddings.
+            relative_weights = self._absolute_position_to_relative_position(p_attn)
+            value_relative_embeddings = self._get_relative_embeddings(
+                self.emb_rel_v, t_s
+            )
+            output = output + self._matmul_with_relative_values(
+                relative_weights, value_relative_embeddings
+            )
+        output = (
+            output.transpose(2, 3).contiguous().view(b, d, t_t)
+        )  # [b, n_h, t_t, d_k] -> [b, d, t_t]
+        return output, p_attn
+    def _matmul_with_relative_values(self, x, y):
+        """
+        x: [b, h, l, m]
+        y: [h or 1, m, d]
+        ret: [b, h, l, d]
+        """
+        ret = torch.matmul(x, y.unsqueeze(0))
+        return ret
+    def _matmul_with_relative_keys(self, x, y):
+        """
+        x: [b, h, l, d]
+        y: [h or 1, m, d]
+        ret: [b, h, l, m]
+        """
+        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
+        return ret
+    def _get_relative_embeddings(self, relative_embeddings, length):
+        """Pad and/or slice the embedding table to the 2*length-1 offsets needed for ``length``."""
+        # NOTE(review): max_relative_position is computed but never used below.
+        max_relative_position = 2 * self.window_size + 1
+        # Pad first before slice to avoid using cond ops.
+        pad_length = max(length - (self.window_size + 1), 0)
+        slice_start_position = max((self.window_size + 1) - length, 0)
+        slice_end_position = slice_start_position + 2 * length - 1
+        if pad_length > 0:
+            padded_relative_embeddings = F.pad(
+                relative_embeddings,
+                commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
+            )
+        else:
+            padded_relative_embeddings = relative_embeddings
+        used_relative_embeddings = padded_relative_embeddings[
+            :, slice_start_position:slice_end_position
+        ]
+        return used_relative_embeddings
+    def _relative_position_to_absolute_position(self, x):
+        """
+        x: [b, h, l, 2*l-1]
+        ret: [b, h, l, l]
+        """
+        batch, heads, length, _ = x.size()
+        # Concat columns of pad to shift from relative to absolute indexing.
+        x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
+        # Concat extra elements so as to add up to shape (len+1, 2*len-1).
+        x_flat = x.view([batch, heads, length * 2 * length])
+        x_flat = F.pad(
+            x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
+        )
+        # Reshape and slice out the padded elements.
+        x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
+            :, :, :length, length - 1 :
+        ]
+        return x_final
+    def _absolute_position_to_relative_position(self, x):
+        """
+        x: [b, h, l, l]
+        ret: [b, h, l, 2*l-1]
+        """
+        batch, heads, length, _ = x.size()
+        # pad along the column axis
+        x = F.pad(
+            x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
+        )
+        x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
+        # add 0's in the beginning that will skew the elements after reshape
+        x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
+        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
+        return x_final
+    def _attention_bias_proximal(self, length):
+        """Bias for self-attention to encourage attention to close positions.
+        Args:
+          length: an integer scalar.
+        Returns:
+          a Tensor with shape [1, 1, length, length]
+        """
+        r = torch.arange(length, dtype=torch.float32)
+        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
+        # The bias is -log(1 + |i - j|): zero on the diagonal, more negative further away.
+        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
+class FFN(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        filter_channels,
+        kernel_size,
+        p_dropout=0.0,
+        activation=None,
+        causal=False,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.filter_channels = filter_channels
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.activation = activation
+        self.causal = causal
+        if causal:
+            self.padding = self._causal_padding
+        else:
+            self.padding = self._same_padding
+        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
+        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
+        self.drop = nn.Dropout(p_dropout)
+    def forward(self, x, x_mask):
+        x = self.conv_1(self.padding(x * x_mask))
+        if self.activation == "gelu":
+            x = x * torch.sigmoid(1.702 * x)
+        else:
+            x = torch.relu(x)
+        x = self.drop(x)
+        x = self.conv_2(self.padding(x * x_mask))
+        return x * x_mask
+    def _causal_padding(self, x):
+        if self.kernel_size == 1:
+            return x
+        pad_l = self.kernel_size - 1
+        pad_r = 0
+        padding = [[0, 0], [0, 0], [pad_l, pad_r]]
+        x = F.pad(x, commons.convert_pad_shape(padding))
+        return x
+    def _same_padding(self, x):
+        if self.kernel_size == 1:
+            return x
+        pad_l = (self.kernel_size - 1) // 2
+        pad_r = self.kernel_size // 2
+        padding = [[0, 0], [0, 0], [pad_l, pad_r]]
+        x = F.pad(x, commons.convert_pad_shape(padding))
+        return x

src/infer_pack/commons.py ADDED Viewed

	@@ -0,0 +1,166 @@

+import math
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import functional as F
+def init_weights(m, mean=0.0, std=0.01):
+    classname = m.__class__.__name__
+    if classname.find("Conv") != -1:
+        m.weight.data.normal_(mean, std)
+def get_padding(kernel_size, dilation=1):
+    return int((kernel_size * dilation - dilation) / 2)
+def convert_pad_shape(pad_shape):
+    l = pad_shape[::-1]
+    pad_shape = [item for sublist in l for item in sublist]
+    return pad_shape
+def kl_divergence(m_p, logs_p, m_q, logs_q):
+    """KL(P||Q)"""
+    kl = (logs_q - logs_p) - 0.5
+    kl += (
+        0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
+    )
+    return kl
+def rand_gumbel(shape):
+    """Sample from the Gumbel distribution, protect from overflows."""
+    uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
+    return -torch.log(-torch.log(uniform_samples))
+def rand_gumbel_like(x):
+    g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
+    return g
+def slice_segments(x, ids_str, segment_size=4):
+    ret = torch.zeros_like(x[:, :, :segment_size])
+    for i in range(x.size(0)):
+        idx_str = ids_str[i]
+        idx_end = idx_str + segment_size
+        ret[i] = x[i, :, idx_str:idx_end]
+    return ret
+def slice_segments2(x, ids_str, segment_size=4):
+    ret = torch.zeros_like(x[:, :segment_size])
+    for i in range(x.size(0)):
+        idx_str = ids_str[i]
+        idx_end = idx_str + segment_size
+        ret[i] = x[i, idx_str:idx_end]
+    return ret
+def rand_slice_segments(x, x_lengths=None, segment_size=4):
+    b, d, t = x.size()
+    if x_lengths is None:
+        x_lengths = t
+    ids_str_max = x_lengths - segment_size + 1
+    ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
+    ret = slice_segments(x, ids_str, segment_size)
+    return ret, ids_str
+def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
+    position = torch.arange(length, dtype=torch.float)
+    num_timescales = channels // 2
+    log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
+        num_timescales - 1
+    )
+    inv_timescales = min_timescale * torch.exp(
+        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
+    )
+    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
+    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
+    signal = F.pad(signal, [0, 0, 0, channels % 2])
+    signal = signal.view(1, channels, length)
+    return signal
+def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
+    b, channels, length = x.size()
+    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
+    return x + signal.to(dtype=x.dtype, device=x.device)
+def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
+    b, channels, length = x.size()
+    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
+    return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
+def subsequent_mask(length):
+    mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
+    return mask
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+    """Return tanh(first part) * sigmoid(second part) of ``input_a + input_b``.
+    The sum is split along dim 1 at index ``n_channels[0]``: channels before the
+    index go through tanh, the remaining channels through sigmoid.
+    """
+    # n_channels is subscripted, so callers must pass something indexable
+    # (presumably a one-element tensor - confirm against callers).
+    n_channels_int = n_channels[0]
+    in_act = input_a + input_b
+    t_act = torch.tanh(in_act[:, :n_channels_int, :])
+    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+    acts = t_act * s_act
+    return acts
+def convert_pad_shape(pad_shape):
+    l = pad_shape[::-1]
+    pad_shape = [item for sublist in l for item in sublist]
+    return pad_shape
+def shift_1d(x):
+    x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
+    return x
+def sequence_mask(length, max_length=None):
+    if max_length is None:
+        max_length = length.max()
+    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
+    return x.unsqueeze(0) < length.unsqueeze(1)
+def generate_path(duration, mask):
+    """
+    duration: [b, 1, t_x]
+    mask: [b, 1, t_y, t_x]
+    """
+    device = duration.device
+    b, _, t_y, t_x = mask.shape
+    cum_duration = torch.cumsum(duration, -1)
+    cum_duration_flat = cum_duration.view(b * t_x)
+    path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
+    path = path.view(b, t_x, t_y)
+    path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
+    path = path.unsqueeze(1).transpose(2, 3) * mask
+    return path
+def clip_grad_value_(parameters, clip_value, norm_type=2):
+    if isinstance(parameters, torch.Tensor):
+        parameters = [parameters]
+    parameters = list(filter(lambda p: p.grad is not None, parameters))
+    norm_type = float(norm_type)
+    if clip_value is not None:
+        clip_value = float(clip_value)
+    total_norm = 0
+    for p in parameters:
+        param_norm = p.grad.data.norm(norm_type)
+        total_norm += param_norm.item() ** norm_type
+        if clip_value is not None:
+            p.grad.data.clamp_(min=-clip_value, max=clip_value)
+    total_norm = total_norm ** (1.0 / norm_type)
+    return total_norm

src/infer_pack/models.py ADDED Viewed

	@@ -0,0 +1,1124 @@

+import math, pdb, os
+from time import time as ttime
+import torch
+from torch import nn
+from torch.nn import functional as F
+from src.infer_pack import modules
+from src.infer_pack import attentions
+from src.infer_pack import commons
+from src.infer_pack.commons import init_weights, get_padding
+from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+from src.infer_pack.commons import init_weights
+import numpy as np
+from src.infer_pack import commons
+class TextEncoder256(nn.Module):
+    def __init__(
+        self,
+        out_channels,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size,
+        p_dropout,
+        f0=True,
+    ):
+        super().__init__()
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.emb_phone = nn.Linear(256, hidden_channels)
+        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
+        if f0 == True:
+            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
+        self.encoder = attentions.Encoder(
+            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
+        )
+        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+    def forward(self, phone, pitch, lengths):
+        if pitch == None:
+            x = self.emb_phone(phone)
+        else:
+            x = self.emb_phone(phone) + self.emb_pitch(pitch)
+        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
+        x = self.lrelu(x)
+        x = torch.transpose(x, 1, -1)  # [b, h, t]
+        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
+            x.dtype
+        )
+        x = self.encoder(x * x_mask, x_mask)
+        stats = self.proj(x) * x_mask
+        m, logs = torch.split(stats, self.out_channels, dim=1)
+        return m, logs, x_mask
+class TextEncoder768(nn.Module):
+    def __init__(
+        self,
+        out_channels,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size,
+        p_dropout,
+        f0=True,
+    ):
+        super().__init__()
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.emb_phone = nn.Linear(768, hidden_channels)
+        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
+        if f0 == True:
+            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
+        self.encoder = attentions.Encoder(
+            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
+        )
+        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+    def forward(self, phone, pitch, lengths):
+        if pitch == None:
+            x = self.emb_phone(phone)
+        else:
+            x = self.emb_phone(phone) + self.emb_pitch(pitch)
+        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
+        x = self.lrelu(x)
+        x = torch.transpose(x, 1, -1)  # [b, h, t]
+        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
+            x.dtype
+        )
+        x = self.encoder(x * x_mask, x_mask)
+        stats = self.proj(x) * x_mask
+        m, logs = torch.split(stats, self.out_channels, dim=1)
+        return m, logs, x_mask
+class ResidualCouplingBlock(nn.Module):
+    def __init__(
+        self,
+        channels,
+        hidden_channels,
+        kernel_size,
+        dilation_rate,
+        n_layers,
+        n_flows=4,
+        gin_channels=0,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.n_flows = n_flows
+        self.gin_channels = gin_channels
+        self.flows = nn.ModuleList()
+        for i in range(n_flows):
+            self.flows.append(
+                modules.ResidualCouplingLayer(
+                    channels,
+                    hidden_channels,
+                    kernel_size,
+                    dilation_rate,
+                    n_layers,
+                    gin_channels=gin_channels,
+                    mean_only=True,
+                )
+            )
+            self.flows.append(modules.Flip())
+    def forward(self, x, x_mask, g=None, reverse=False):
+        if not reverse:
+            for flow in self.flows:
+                x, _ = flow(x, x_mask, g=g, reverse=reverse)
+        else:
+            for flow in reversed(self.flows):
+                x = flow(x, x_mask, g=g, reverse=reverse)
+        return x
+    def remove_weight_norm(self):
+        for i in range(self.n_flows):
+            self.flows[i * 2].remove_weight_norm()
+class PosteriorEncoder(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        hidden_channels,
+        kernel_size,
+        dilation_rate,
+        n_layers,
+        gin_channels=0,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels
+        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
+        self.enc = modules.WN(
+            hidden_channels,
+            kernel_size,
+            dilation_rate,
+            n_layers,
+            gin_channels=gin_channels,
+        )
+        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+    def forward(self, x, x_lengths, g=None):
+        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
+            x.dtype
+        )
+        x = self.pre(x) * x_mask
+        x = self.enc(x, x_mask, g=g)
+        stats = self.proj(x) * x_mask
+        m, logs = torch.split(stats, self.out_channels, dim=1)
+        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
+        return z, m, logs, x_mask
+    def remove_weight_norm(self):
+        self.enc.remove_weight_norm()
+class Generator(torch.nn.Module):
+    def __init__(
+        self,
+        initial_channel,
+        resblock,
+        resblock_kernel_sizes,
+        resblock_dilation_sizes,
+        upsample_rates,
+        upsample_initial_channel,
+        upsample_kernel_sizes,
+        gin_channels=0,
+    ):
+        super(Generator, self).__init__()
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_rates)
+        self.conv_pre = Conv1d(
+            initial_channel, upsample_initial_channel, 7, 1, padding=3
+        )
+        resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
+        self.ups = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            self.ups.append(
+                weight_norm(
+                    ConvTranspose1d(
+                        upsample_initial_channel // (2**i),
+                        upsample_initial_channel // (2 ** (i + 1)),
+                        k,
+                        u,
+                        padding=(k - u) // 2,
+                    )
+                )
+            )
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = upsample_initial_channel // (2 ** (i + 1))
+            for j, (k, d) in enumerate(
+                zip(resblock_kernel_sizes, resblock_dilation_sizes)
+            ):
+                self.resblocks.append(resblock(ch, k, d))
+        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
+        self.ups.apply(init_weights)
+        if gin_channels != 0:
+            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
+    def forward(self, x, g=None):
+        x = self.conv_pre(x)
+        if g is not None:
+            x = x + self.cond(g)
+        for i in range(self.num_upsamples):
+            x = F.leaky_relu(x, modules.LRELU_SLOPE)
+            x = self.ups[i](x)
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels
+        x = F.leaky_relu(x)
+        x = self.conv_post(x)
+        x = torch.tanh(x)
+        return x
+    def remove_weight_norm(self):
+        for l in self.ups:
+            remove_weight_norm(l)
+        for l in self.resblocks:
+            l.remove_weight_norm()
+class SineGen(torch.nn.Module):
+    """Sine-wave excitation generator driven by an F0 contour.
+    SineGen(samp_rate, harmonic_num = 0,
+            sine_amp = 0.1, noise_std = 0.003,
+            voiced_threshold = 0,
+            flag_for_pulse=False)
+    samp_rate: sampling rate in Hz
+    harmonic_num: number of harmonic overtones (default 0)
+    sine_amp: amplitude of the sine waveform (default 0.1)
+    noise_std: std of the Gaussian noise added on voiced steps (default 0.003)
+    voiced_threshold: F0 threshold for U/V classification (default 0)
+    flag_for_pulse: accepted for signature compatibility; this implementation
+        neither stores nor uses it (default False)
+    """
+    def __init__(
+        self,
+        samp_rate,
+        harmonic_num=0,
+        sine_amp=0.1,
+        noise_std=0.003,
+        voiced_threshold=0,
+        flag_for_pulse=False,
+    ):
+        super(SineGen, self).__init__()
+        self.sine_amp = sine_amp
+        self.noise_std = noise_std
+        self.harmonic_num = harmonic_num
+        # One channel for the fundamental plus one per overtone.
+        self.dim = self.harmonic_num + 1
+        self.sampling_rate = samp_rate
+        self.voiced_threshold = voiced_threshold
+    def _f02uv(self, f0):
+        """Return a mask shaped like ``f0``: 1 where ``f0 > voiced_threshold``, else 0."""
+        # generate uv signal
+        uv = torch.ones_like(f0)
+        uv = uv * (f0 > self.voiced_threshold)
+        return uv
+    def forward(self, f0, upp):
+        """sine_waves, uv, noise = forward(f0, upp)
+        f0: frame-level F0; presumably shaped (batch, length), since a trailing
+            axis is inserted below -- TODO confirm against callers.
+            f0 for unvoiced steps should be 0
+        upp: upsampling factor from frames to samples (used as ``scale_factor``)
+        output sine_waves: sine + noise excitation, last axis of size ``self.dim``
+        output uv: voiced/unvoiced mask upsampled by ``upp``
+        output noise: the noise term that was added to ``sine_waves``
+        Everything runs under ``torch.no_grad()``.
+        """
+        with torch.no_grad():
+            # Insert an axis after the batch axis and move it to the end.
+            f0 = f0[:, None].transpose(1, 2)
+            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
+            # fundamental component
+            f0_buf[:, :, 0] = f0[:, :, 0]
+            for idx in np.arange(self.harmonic_num):
+                f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
+                    idx + 2
+                )  # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
+            # Frequency in cycles per sample, wrapped into [0, 1).
+            # Original note: the "% 1" here means the n_har product cannot be
+            # optimized in post-processing.
+            rad_values = (f0_buf / self.sampling_rate) % 1
+            # Random initial phase per channel; the fundamental keeps phase 0.
+            rand_ini = torch.rand(
+                f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
+            )
+            rand_ini[:, 0] = 0
+            rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
+            # Frame-level accumulated phase. Original note: applying "% 1" here
+            # would mean the later cumsum could no longer be optimized.
+            tmp_over_one = torch.cumsum(rad_values, 1)
+            tmp_over_one *= upp
+            # Linearly interpolate the accumulated phase along the time axis by ``upp``.
+            tmp_over_one = F.interpolate(
+                tmp_over_one.transpose(2, 1),
+                scale_factor=upp,
+                mode="linear",
+                align_corners=True,
+            ).transpose(2, 1)
+            # Repeat each per-frame increment ``upp`` times (nearest-neighbour upsampling).
+            rad_values = F.interpolate(
+                rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
+            ).transpose(
+                2, 1
+            )
+            tmp_over_one %= 1
+            # True wherever the wrapped phase decreases, i.e. it passed a full cycle.
+            tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
+            # Subtract one full cycle at each wrap position before the final cumsum
+            # (presumably to keep the accumulated phase small -- not stated in the code).
+            cumsum_shift = torch.zeros_like(rad_values)
+            cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
+            sine_waves = torch.sin(
+                torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
+            )
+            sine_waves = sine_waves * self.sine_amp
+            uv = self._f02uv(f0)
+            uv = F.interpolate(
+                uv.transpose(2, 1), scale_factor=upp, mode="nearest"
+            ).transpose(2, 1)
+            # Voiced steps get noise of std ``noise_std``; unvoiced steps get
+            # noise of std ``sine_amp / 3``.
+            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+            noise = noise_amp * torch.randn_like(sine_waves)
+            # The sine is zeroed on unvoiced steps, leaving only the noise there.
+            sine_waves = sine_waves * uv + noise
+        return sine_waves, uv, noise
+class SourceModuleHnNSF(torch.nn.Module):
+    """SourceModule for hn-nsf
+    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
+                 add_noise_std=0.003, voiced_threshod=0)
+    sampling_rate: sampling_rate in Hz
+    harmonic_num: number of harmonic above F0 (default: 0)
+    sine_amp: amplitude of sine source signal (default: 0.1)
+    add_noise_std: std of additive Gaussian noise (default: 0.003)
+        note that amplitude of noise in unvoiced is decided
+        by sine_amp
+    voiced_threshold: threhold to set U/V given F0 (default: 0)
+    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
+    F0_sampled (batchsize, length, 1)
+    Sine_source (batchsize, length, 1)
+    noise_source (batchsize, length 1)
+    uv (batchsize, length, 1)
+    """
+    def __init__(
+        self,
+        sampling_rate,
+        harmonic_num=0,
+        sine_amp=0.1,
+        add_noise_std=0.003,
+        voiced_threshod=0,
+        is_half=True,
+    ):
+        super(SourceModuleHnNSF, self).__init__()
+        self.sine_amp = sine_amp
+        self.noise_std = add_noise_std
+        self.is_half = is_half
+        # to produce sine waveforms
+        self.l_sin_gen = SineGen(
+            sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
+        )
+        # to merge source harmonics into a single excitation
+        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
+        self.l_tanh = torch.nn.Tanh()
+    def forward(self, x, upp=None):
+        sine_wavs, uv, _ = self.l_sin_gen(x, upp)
+        if self.is_half:
+            sine_wavs = sine_wavs.half()
+        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
+        return sine_merge, None, None  # noise, uv
+class GeneratorNSF(torch.nn.Module):
+    def __init__(
+        self,
+        initial_channel,
+        resblock,
+        resblock_kernel_sizes,
+        resblock_dilation_sizes,
+        upsample_rates,
+        upsample_initial_channel,
+        upsample_kernel_sizes,
+        gin_channels,
+        sr,
+        is_half=False,
+    ):
+        super(GeneratorNSF, self).__init__()
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_rates)
+        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
+        self.m_source = SourceModuleHnNSF(
+            sampling_rate=sr, harmonic_num=0, is_half=is_half
+        )
+        self.noise_convs = nn.ModuleList()
+        self.conv_pre = Conv1d(
+            initial_channel, upsample_initial_channel, 7, 1, padding=3
+        )
+        resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
+        self.ups = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            c_cur = upsample_initial_channel // (2 ** (i + 1))
+            self.ups.append(
+                weight_norm(
+                    ConvTranspose1d(
+                        upsample_initial_channel // (2**i),
+                        upsample_initial_channel // (2 ** (i + 1)),
+                        k,
+                        u,
+                        padding=(k - u) // 2,
+                    )
+                )
+            )
+            if i + 1 < len(upsample_rates):
+                stride_f0 = np.prod(upsample_rates[i + 1 :])
+                self.noise_convs.append(
+                    Conv1d(
+                        1,
+                        c_cur,
+                        kernel_size=stride_f0 * 2,
+                        stride=stride_f0,
+                        padding=stride_f0 // 2,
+                    )
+                )
+            else:
+                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = upsample_initial_channel // (2 ** (i + 1))
+            for j, (k, d) in enumerate(
+                zip(resblock_kernel_sizes, resblock_dilation_sizes)
+            ):
+                self.resblocks.append(resblock(ch, k, d))
+        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
+        self.ups.apply(init_weights)
+        if gin_channels != 0:
+            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
+        self.upp = np.prod(upsample_rates)
+    def forward(self, x, f0, g=None):
+        har_source, noi_source, uv = self.m_source(f0, self.upp)
+        har_source = har_source.transpose(1, 2)
+        x = self.conv_pre(x)
+        if g is not None:
+            x = x + self.cond(g)
+        for i in range(self.num_upsamples):
+            x = F.leaky_relu(x, modules.LRELU_SLOPE)
+            x = self.ups[i](x)
+            x_source = self.noise_convs[i](har_source)
+            x = x + x_source
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels
+        x = F.leaky_relu(x)
+        x = self.conv_post(x)
+        x = torch.tanh(x)
+        return x
+    def remove_weight_norm(self):
+        for l in self.ups:
+            remove_weight_norm(l)
+        for l in self.resblocks:
+            l.remove_weight_norm()
+sr2sr = {
+    "32k": 32000,
+    "40k": 40000,
+    "48k": 48000,
+}
+class SynthesizerTrnMs256NSFsid(nn.Module):
+    def __init__(
+        self,
+        spec_channels,
+        segment_size,
+        inter_channels,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size,
+        p_dropout,
+        resblock,
+        resblock_kernel_sizes,
+        resblock_dilation_sizes,
+        upsample_rates,
+        upsample_initial_channel,
+        upsample_kernel_sizes,
+        spk_embed_dim,
+        gin_channels,
+        sr,
+        **kwargs
+    ):
+        super().__init__()
+        if type(sr) == type("strr"):
+            sr = sr2sr[sr]
+        self.spec_channels = spec_channels
+        self.inter_channels = inter_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.resblock = resblock
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.upsample_rates = upsample_rates
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.segment_size = segment_size
+        self.gin_channels = gin_channels
+        # self.hop_length = hop_length#
+        self.spk_embed_dim = spk_embed_dim
+        self.enc_p = TextEncoder256(
+            inter_channels,
+            hidden_channels,
+            filter_channels,
+            n_heads,
+            n_layers,
+            kernel_size,
+            p_dropout,
+        )
+        self.dec = GeneratorNSF(
+            inter_channels,
+            resblock,
+            resblock_kernel_sizes,
+            resblock_dilation_sizes,
+            upsample_rates,
+            upsample_initial_channel,
+            upsample_kernel_sizes,
+            gin_channels=gin_channels,
+            sr=sr,
+            is_half=kwargs["is_half"],
+        )
+        self.enc_q = PosteriorEncoder(
+            spec_channels,
+            inter_channels,
+            hidden_channels,
+            5,
+            1,
+            16,
+            gin_channels=gin_channels,
+        )
+        self.flow = ResidualCouplingBlock(
+            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
+        )
+        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
+        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
+    def remove_weight_norm(self):
+        self.dec.remove_weight_norm()
+        self.flow.remove_weight_norm()
+        self.enc_q.remove_weight_norm()
+    def forward(
+        self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
+    ):  # 这里ds是id，[bs,1]
+        # print(1,pitch.shape)#[bs,t]
+        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]##1是t，广播的
+        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
+        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
+        z_p = self.flow(z, y_mask, g=g)
+        z_slice, ids_slice = commons.rand_slice_segments(
+            z, y_lengths, self.segment_size
+        )
+        # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
+        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
+        # print(-2,pitchf.shape,z_slice.shape)
+        o = self.dec(z_slice, pitchf, g=g)
+        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
+    def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
+        g = self.emb_g(sid).unsqueeze(-1)
+        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
+        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
+        z = self.flow(z_p, x_mask, g=g, reverse=True)
+        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
+        return o, x_mask, (z, z_p, m_p, logs_p)
+class SynthesizerTrnMs768NSFsid(nn.Module):
+    def __init__(
+        self,
+        spec_channels,
+        segment_size,
+        inter_channels,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size,
+        p_dropout,
+        resblock,
+        resblock_kernel_sizes,
+        resblock_dilation_sizes,
+        upsample_rates,
+        upsample_initial_channel,
+        upsample_kernel_sizes,
+        spk_embed_dim,
+        gin_channels,
+        sr,
+        **kwargs
+    ):
+        super().__init__()
+        if type(sr) == type("strr"):
+            sr = sr2sr[sr]
+        self.spec_channels = spec_channels
+        self.inter_channels = inter_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.resblock = resblock
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.upsample_rates = upsample_rates
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.segment_size = segment_size
+        self.gin_channels = gin_channels
+        # self.hop_length = hop_length#
+        self.spk_embed_dim = spk_embed_dim
+        self.enc_p = TextEncoder768(
+            inter_channels,
+            hidden_channels,
+            filter_channels,
+            n_heads,
+            n_layers,
+            kernel_size,
+            p_dropout,
+        )
+        self.dec = GeneratorNSF(
+            inter_channels,
+            resblock,
+            resblock_kernel_sizes,
+            resblock_dilation_sizes,
+            upsample_rates,
+            upsample_initial_channel,
+            upsample_kernel_sizes,
+            gin_channels=gin_channels,
+            sr=sr,
+            is_half=kwargs["is_half"],
+        )
+        self.enc_q = PosteriorEncoder(
+            spec_channels,
+            inter_channels,
+            hidden_channels,
+            5,
+            1,
+            16,
+            gin_channels=gin_channels,
+        )
+        self.flow = ResidualCouplingBlock(
+            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
+        )
+        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
+        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
+    def remove_weight_norm(self):
+        self.dec.remove_weight_norm()
+        self.flow.remove_weight_norm()
+        self.enc_q.remove_weight_norm()
+    def forward(
+        self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
+    ):  # 这里ds是id，[bs,1]
+        # print(1,pitch.shape)#[bs,t]
+        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]##1是t，广播的
+        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
+        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
+        z_p = self.flow(z, y_mask, g=g)
+        z_slice, ids_slice = commons.rand_slice_segments(
+            z, y_lengths, self.segment_size
+        )
+        # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
+        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
+        # print(-2,pitchf.shape,z_slice.shape)
+        o = self.dec(z_slice, pitchf, g=g)
+        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
+    def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
+        g = self.emb_g(sid).unsqueeze(-1)
+        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
+        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
+        z = self.flow(z_p, x_mask, g=g, reverse=True)
+        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
+        return o, x_mask, (z, z_p, m_p, logs_p)
+class SynthesizerTrnMs256NSFsid_nono(nn.Module):
+    def __init__(
+        self,
+        spec_channels,
+        segment_size,
+        inter_channels,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size,
+        p_dropout,
+        resblock,
+        resblock_kernel_sizes,
+        resblock_dilation_sizes,
+        upsample_rates,
+        upsample_initial_channel,
+        upsample_kernel_sizes,
+        spk_embed_dim,
+        gin_channels,
+        sr=None,
+        **kwargs
+    ):
+        super().__init__()
+        self.spec_channels = spec_channels
+        self.inter_channels = inter_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.resblock = resblock
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.upsample_rates = upsample_rates
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.segment_size = segment_size
+        self.gin_channels = gin_channels
+        # self.hop_length = hop_length#
+        self.spk_embed_dim = spk_embed_dim
+        self.enc_p = TextEncoder256(
+            inter_channels,
+            hidden_channels,
+            filter_channels,
+            n_heads,
+            n_layers,
+            kernel_size,
+            p_dropout,
+            f0=False,
+        )
+        self.dec = Generator(
+            inter_channels,
+            resblock,
+            resblock_kernel_sizes,
+            resblock_dilation_sizes,
+            upsample_rates,
+            upsample_initial_channel,
+            upsample_kernel_sizes,
+            gin_channels=gin_channels,
+        )
+        self.enc_q = PosteriorEncoder(
+            spec_channels,
+            inter_channels,
+            hidden_channels,
+            5,
+            1,
+            16,
+            gin_channels=gin_channels,
+        )
+        self.flow = ResidualCouplingBlock(
+            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
+        )
+        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
+        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
+    def remove_weight_norm(self):
+        self.dec.remove_weight_norm()
+        self.flow.remove_weight_norm()
+        self.enc_q.remove_weight_norm()
+    def forward(self, phone, phone_lengths, y, y_lengths, ds):  # 这里ds是id，[bs,1]
+        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]##1是t，广播的
+        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
+        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
+        z_p = self.flow(z, y_mask, g=g)
+        z_slice, ids_slice = commons.rand_slice_segments(
+            z, y_lengths, self.segment_size
+        )
+        o = self.dec(z_slice, g=g)
+        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
+    def infer(self, phone, phone_lengths, sid, max_len=None):
+        g = self.emb_g(sid).unsqueeze(-1)
+        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
+        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
+        z = self.flow(z_p, x_mask, g=g, reverse=True)
+        o = self.dec((z * x_mask)[:, :, :max_len], g=g)
+        return o, x_mask, (z, z_p, m_p, logs_p)
+class SynthesizerTrnMs768NSFsid_nono(nn.Module):
+    def __init__(
+        self,
+        spec_channels,
+        segment_size,
+        inter_channels,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size,
+        p_dropout,
+        resblock,
+        resblock_kernel_sizes,
+        resblock_dilation_sizes,
+        upsample_rates,
+        upsample_initial_channel,
+        upsample_kernel_sizes,
+        spk_embed_dim,
+        gin_channels,
+        sr=None,
+        **kwargs
+    ):
+        super().__init__()
+        self.spec_channels = spec_channels
+        self.inter_channels = inter_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.resblock = resblock
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.upsample_rates = upsample_rates
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.segment_size = segment_size
+        self.gin_channels = gin_channels
+        # self.hop_length = hop_length#
+        self.spk_embed_dim = spk_embed_dim
+        self.enc_p = TextEncoder768(
+            inter_channels,
+            hidden_channels,
+            filter_channels,
+            n_heads,
+            n_layers,
+            kernel_size,
+            p_dropout,
+            f0=False,
+        )
+        self.dec = Generator(
+            inter_channels,
+            resblock,
+            resblock_kernel_sizes,
+            resblock_dilation_sizes,
+            upsample_rates,
+            upsample_initial_channel,
+            upsample_kernel_sizes,
+            gin_channels=gin_channels,
+        )
+        self.enc_q = PosteriorEncoder(
+            spec_channels,
+            inter_channels,
+            hidden_channels,
+            5,
+            1,
+            16,
+            gin_channels=gin_channels,
+        )
+        self.flow = ResidualCouplingBlock(
+            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
+        )
+        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
+        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
+    def remove_weight_norm(self):
+        self.dec.remove_weight_norm()
+        self.flow.remove_weight_norm()
+        self.enc_q.remove_weight_norm()
+    def forward(self, phone, phone_lengths, y, y_lengths, ds):  # 这里ds是id，[bs,1]
+        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]##1是t，广播的
+        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
+        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
+        z_p = self.flow(z, y_mask, g=g)
+        z_slice, ids_slice = commons.rand_slice_segments(
+            z, y_lengths, self.segment_size
+        )
+        o = self.dec(z_slice, g=g)
+        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
+    def infer(self, phone, phone_lengths, sid, max_len=None):
+        g = self.emb_g(sid).unsqueeze(-1)
+        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
+        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
+        z = self.flow(z_p, x_mask, g=g, reverse=True)
+        o = self.dec((z * x_mask)[:, :, :max_len], g=g)
+        return o, x_mask, (z, z_p, m_p, logs_p)
+class MultiPeriodDiscriminator(torch.nn.Module):
+    def __init__(self, use_spectral_norm=False):
+        super(MultiPeriodDiscriminator, self).__init__()
+        periods = [2, 3, 5, 7, 11, 17]
+        # periods = [3, 5, 7, 11, 17, 23, 37]
+        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
+        discs = discs + [
+            DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
+        ]
+        self.discriminators = nn.ModuleList(discs)
+    def forward(self, y, y_hat):
+        y_d_rs = []  #
+        y_d_gs = []
+        fmap_rs = []
+        fmap_gs = []
+        for i, d in enumerate(self.discriminators):
+            y_d_r, fmap_r = d(y)
+            y_d_g, fmap_g = d(y_hat)
+            # for j in range(len(fmap_r)):
+            #     print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
+            y_d_rs.append(y_d_r)
+            y_d_gs.append(y_d_g)
+            fmap_rs.append(fmap_r)
+            fmap_gs.append(fmap_g)
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+class MultiPeriodDiscriminatorV2(torch.nn.Module):
+    def __init__(self, use_spectral_norm=False):
+        super(MultiPeriodDiscriminatorV2, self).__init__()
+        # periods = [2, 3, 5, 7, 11, 17]
+        periods = [2, 3, 5, 7, 11, 17, 23, 37]
+        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
+        discs = discs + [
+            DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
+        ]
+        self.discriminators = nn.ModuleList(discs)
+    def forward(self, y, y_hat):
+        y_d_rs = []  #
+        y_d_gs = []
+        fmap_rs = []
+        fmap_gs = []
+        for i, d in enumerate(self.discriminators):
+            y_d_r, fmap_r = d(y)
+            y_d_g, fmap_g = d(y_hat)
+            # for j in range(len(fmap_r)):
+            #     print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
+            y_d_rs.append(y_d_r)
+            y_d_gs.append(y_d_g)
+            fmap_rs.append(fmap_r)
+            fmap_gs.append(fmap_g)
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
class DiscriminatorS(torch.nn.Module):
    """Stack of normalised 1-D convolutions applied to a single-channel input.

    Args:
        use_spectral_norm: when truthy every convolution is wrapped with
            ``spectral_norm``; otherwise with ``weight_norm``.
    """

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        # Truthiness test rather than ``== False`` (PEP 8).
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        # (in_channels, out_channels, kernel_size, stride, groups, padding)
        layer_specs = (
            (1, 16, 15, 1, 1, 7),
            (16, 64, 41, 4, 4, 20),
            (64, 256, 41, 4, 16, 20),
            (256, 1024, 41, 4, 64, 20),
            (1024, 1024, 41, 4, 256, 20),
            (1024, 1024, 5, 1, 1, 2),
        )
        self.convs = nn.ModuleList(
            [
                norm_f(Conv1d(c_in, c_out, kernel, stride, groups=groups, padding=pad))
                for c_in, c_out, kernel, stride, groups, pad in layer_specs
            ]
        )
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return ``(flattened_output, fmap)``.

        ``fmap`` holds the activation after each layer in ``self.convs``
        (post leaky-ReLU) followed by the raw ``conv_post`` output; the first
        return value is that final output flattened from dim 1 onwards.
        """
        fmap = []
        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        return torch.flatten(x, 1, -1), fmap
class DiscriminatorP(torch.nn.Module):
    """Period discriminator: folds the last axis into ``(t // period, period)``
    and applies 2-D convolutions whose kernels span only the first of the two.

    Args:
        period: width the input's last axis is folded into.
        kernel_size: kernel height of the convolutions in ``self.convs``.
        stride: stride (along the kernel axis) of the first four convolutions.
        use_spectral_norm: when truthy wrap layers with ``spectral_norm``,
            otherwise with ``weight_norm``.
    """

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super().__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        # Truthiness test rather than ``== False`` (PEP 8).
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        # Same padding for every layer in ``convs``; computed once.
        pad = (get_padding(kernel_size, 1), 0)
        widths = (1, 32, 128, 512, 1024)
        layers = [
            norm_f(Conv2d(c_in, c_out, (kernel_size, 1), (stride, 1), padding=pad))
            for c_in, c_out in zip(widths[:-1], widths[1:])
        ]
        # The last layer keeps 1024 channels and uses stride 1.
        layers.append(norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=pad)))
        self.convs = nn.ModuleList(layers)
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return ``(flattened_output, fmap)`` for a 3-D input ``x``.

        ``fmap`` collects the post-activation output of each layer in
        ``self.convs`` followed by the ``conv_post`` output.
        """
        fmap = []
        b, c, t = x.shape
        leftover = t % self.period
        if leftover != 0:
            # Reflect-pad on the right so the length divides by the period.
            n_pad = self.period - leftover
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)
        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        return torch.flatten(x, 1, -1), fmap

src/infer_pack/models_onnx.py ADDED Viewed

	@@ -0,0 +1,818 @@

+import math, pdb, os
+from time import time as ttime
+import torch
+from torch import nn
+from torch.nn import functional as F
+from infer_pack import modules
+from infer_pack import attentions
+from infer_pack import commons
+from infer_pack.commons import init_weights, get_padding
+from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+from infer_pack.commons import init_weights
+import numpy as np
+from infer_pack import commons
class TextEncoder256(nn.Module):
    """Encoder for 256-dimensional phone features with optional pitch embedding.

    Args:
        out_channels: size of each of the two outputs ``m`` and ``logs``.
        hidden_channels: width of the embeddings and of ``attentions.Encoder``.
        filter_channels, n_heads, n_layers, kernel_size, p_dropout: passed
            through to ``attentions.Encoder``.
        f0: when truthy a 256-entry pitch embedding ``emb_pitch`` is created.
    """

    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(256, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        """Return ``(m, logs, x_mask)``.

        ``pitch`` may be ``None``, in which case only the phone embedding is
        used. ``m`` and ``logs`` are the two ``out_channels``-sized halves of
        the masked projection.
        """
        x = self.emb_phone(phone)
        # Identity check: ``== None`` on a tensor relies on comparison fallback.
        if pitch is not None:
            x = x + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask
class TextEncoder768(nn.Module):
    """Encoder for 768-dimensional phone features with optional pitch embedding.

    Args:
        out_channels: size of each of the two outputs ``m`` and ``logs``.
        hidden_channels: width of the embeddings and of ``attentions.Encoder``.
        filter_channels, n_heads, n_layers, kernel_size, p_dropout: passed
            through to ``attentions.Encoder``.
        f0: when truthy a 256-entry pitch embedding ``emb_pitch`` is created.
    """

    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(768, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        """Return ``(m, logs, x_mask)``.

        ``pitch`` may be ``None``, in which case only the phone embedding is
        used. ``m`` and ``logs`` are the two ``out_channels``-sized halves of
        the masked projection.
        """
        x = self.emb_phone(phone)
        # Identity check: ``== None`` on a tensor relies on comparison fallback.
        if pitch is not None:
            x = x + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask
class ResidualCouplingBlock(nn.Module):
    """Sequence of ``n_flows`` (``ResidualCouplingLayer``, ``Flip``) pairs.

    ``self.flows`` therefore holds ``2 * n_flows`` modules, with the coupling
    layers at the even indices.
    """

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        n_flows=4,
        gin_channels=0,
    ):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            coupling = modules.ResidualCouplingLayer(
                channels,
                hidden_channels,
                kernel_size,
                dilation_rate,
                n_layers,
                gin_channels=gin_channels,
                mean_only=True,
            )
            self.flows.append(coupling)
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        """Apply the flows in order, or in reversed order when ``reverse``.

        In the forward direction each flow returns a pair whose second item
        is discarded; in the reverse direction each flow returns ``x`` only.
        """
        if reverse:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
            return x
        for flow in self.flows:
            x, _ = flow(x, x_mask, g=g, reverse=reverse)
        return x

    def remove_weight_norm(self):
        """Call ``remove_weight_norm`` on each coupling layer (even indices)."""
        for idx in range(0, 2 * self.n_flows, 2):
            self.flows[idx].remove_weight_norm()
class PosteriorEncoder(nn.Module):
    """1x1 conv -> ``modules.WN`` -> 1x1 conv producing ``m`` and ``logs``,
    from which a masked sample ``z`` is drawn.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            gin_channels=gin_channels,
        )
        # Twice out_channels: the result is split into ``m`` and ``logs``.
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        """Return ``(z, m, logs, x_mask)`` with
        ``z = (m + randn_like(m) * exp(logs)) * x_mask``.
        """
        seq_mask = commons.sequence_mask(x_lengths, x.size(2))
        x_mask = torch.unsqueeze(seq_mask, 1).to(x.dtype)
        hidden = self.pre(x) * x_mask
        hidden = self.enc(hidden, x_mask, g=g)
        stats = self.proj(hidden) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        eps = torch.randn_like(m)
        z = (m + eps * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask

    def remove_weight_norm(self):
        """Delegate to the ``WN`` stack."""
        self.enc.remove_weight_norm()
class Generator(torch.nn.Module):
    """Decoder made of weight-normalised transposed convolutions, each
    followed by a group of residual blocks whose outputs are averaged.

    ``resblock == "1"`` selects ``modules.ResBlock1``; any other value selects
    ``modules.ResBlock2``. When ``gin_channels`` is non-zero a 1x1 convolution
    ``cond`` is created for the conditioning input ``g``.
    """

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels=0,
    ):
        super().__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        block_cls = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        # Stage i maps initial // 2**i channels to initial // 2**(i + 1).
        self.ups = nn.ModuleList()
        for stage, (rate, kernel) in enumerate(
            zip(upsample_rates, upsample_kernel_sizes)
        ):
            in_ch = upsample_initial_channel // (2**stage)
            out_ch = upsample_initial_channel // (2 ** (stage + 1))
            up = ConvTranspose1d(
                in_ch, out_ch, kernel, rate, padding=(kernel - rate) // 2
            )
            self.ups.append(weight_norm(up))

        # num_kernels residual blocks per stage, stored contiguously.
        self.resblocks = nn.ModuleList()
        for stage in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (stage + 1))
            for kernel, dilation in zip(
                resblock_kernel_sizes, resblock_dilation_sizes
            ):
                self.resblocks.append(block_cls(ch, kernel, dilation))

        # ``ch`` is the channel count of the last stage.
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)
        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x, g=None):
        """Decode ``x`` (optionally conditioned on ``g``); output passes
        through ``tanh``.
        """
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)
        for stage in range(self.num_upsamples):
            x = self.ups[stage](F.leaky_relu(x, modules.LRELU_SLOPE))
            first = stage * self.num_kernels
            acc = None
            for offset in range(self.num_kernels):
                branch = self.resblocks[first + offset](x)
                if acc is None:
                    acc = branch
                else:
                    acc += branch
            x = acc / self.num_kernels
        # Final activation uses leaky_relu's default slope.
        x = self.conv_post(F.leaky_relu(x))
        return torch.tanh(x)

    def remove_weight_norm(self):
        """Strip weight norm from the upsamplers and every residual block."""
        for up in self.ups:
            remove_weight_norm(up)
        for block in self.resblocks:
            block.remove_weight_norm()
class SineGen(torch.nn.Module):
    """Sine-wave generator driven by an F0 track.

    SineGen(samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003,
            voiced_threshold=0, flag_for_pulse=False)

    samp_rate: sampling rate in Hz
    harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of the sine waveform (default 0.1)
    noise_std: std of Gaussian noise (default 0.003)
    voiced_threshold: F0 threshold for U/V classification (default 0)
    flag_for_pulse: accepted but not used by this implementation
    """

    def __init__(
        self,
        samp_rate,
        harmonic_num=0,
        sine_amp=0.1,
        noise_std=0.003,
        voiced_threshold=0,
        flag_for_pulse=False,
    ):
        super().__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        self.dim = self.harmonic_num + 1
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold

    def _f02uv(self, f0):
        """Return a mask shaped like ``f0``: 1 where f0 > threshold, else 0."""
        voiced = f0 > self.voiced_threshold
        return torch.ones_like(f0) * voiced

    def forward(self, f0, upp):
        """sine_waves, uv, noise = forward(f0, upp)

        f0: tensor(batch, length); values at or below ``voiced_threshold``
            are treated as unvoiced
        upp: upsampling factor applied along the length axis
        sine_waves: tensor(batch, length * upp, dim)
        uv: tensor(batch, length * upp, 1)
        noise: the additive noise term, same shape as sine_waves
        """
        with torch.no_grad():
            f0 = f0[:, None].transpose(1, 2)
            n_batch, n_frames = f0.shape[0], f0.shape[1]
            harmonics = torch.zeros(n_batch, n_frames, self.dim, device=f0.device)
            # Channel 0 is the fundamental; channel k holds (k + 1) * f0.
            harmonics[:, :, 0] = f0[:, :, 0]
            for k in range(self.harmonic_num):
                harmonics[:, :, k + 1] = harmonics[:, :, 0] * (k + 2)
            # The "% 1" here means the per-harmonic products cannot be
            # optimised in post-processing.
            phase_step = (harmonics / self.sampling_rate) % 1
            # Random initial phase per harmonic; the fundamental's stays 0.
            init_phase = torch.rand(
                harmonics.shape[0], harmonics.shape[2], device=harmonics.device
            )
            init_phase[:, 0] = 0
            phase_step[:, 0, :] = phase_step[:, 0, :] + init_phase
            # No "% 1" on this cumsum: taking it here would prevent the later
            # cumsum from being optimised.
            phase_acc = torch.cumsum(phase_step, 1)
            phase_acc *= upp
            phase_acc = F.interpolate(
                phase_acc.transpose(2, 1),
                scale_factor=upp,
                mode="linear",
                align_corners=True,
            ).transpose(2, 1)
            phase_step = F.interpolate(
                phase_step.transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(2, 1)
            phase_acc %= 1
            # Positions where the wrapped phase decreased get a -1 shift.
            wrapped = (phase_acc[:, 1:, :] - phase_acc[:, :-1, :]) < 0
            shift = torch.zeros_like(phase_step)
            shift[:, 1:, :] = wrapped * -1.0
            sine_waves = torch.sin(
                torch.cumsum(phase_step + shift, dim=1) * 2 * np.pi
            )
            sine_waves = sine_waves * self.sine_amp
            uv = F.interpolate(
                self._f02uv(f0).transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(2, 1)
            # Noise std: noise_std where voiced, sine_amp / 3 where unvoiced.
            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
            noise = noise_amp * torch.randn_like(sine_waves)
            sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise
class SourceModuleHnNSF(torch.nn.Module):
    """Source module for hn-NSF.

    SourceModuleHnNSF(sampling_rate, harmonic_num=0, sine_amp=0.1,
                      add_noise_std=0.003, voiced_threshod=0, is_half=True)

    sampling_rate: sampling rate in Hz
    harmonic_num: number of harmonics above F0 (default: 0)
    sine_amp: amplitude of the sine source signal (default: 0.1)
    add_noise_std: std of additive Gaussian noise (default: 0.003)
    voiced_threshod: threshold for the U/V decision given F0 (default: 0)
    is_half: when truthy the sine waves are cast with ``.half()`` before
        being merged

    ``forward`` returns ``(sine_merge, None, None)``; the noise and uv slots
    are not populated.
    """

    def __init__(
        self,
        sampling_rate,
        harmonic_num=0,
        sine_amp=0.1,
        add_noise_std=0.003,
        voiced_threshod=0,
        is_half=True,
    ):
        super().__init__()
        self.sine_amp = sine_amp
        self.noise_std = add_noise_std
        self.is_half = is_half
        # Produces the sine waveforms.
        self.l_sin_gen = SineGen(
            sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
        )
        # Merges the harmonic channels into a single excitation channel.
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x, upp=None):
        """Generate sine waves from ``x`` and merge them via Linear + Tanh."""
        waves = self.l_sin_gen(x, upp)[0]
        if self.is_half:
            waves = waves.half()
        merged = self.l_linear(waves)
        merged = self.l_tanh(merged)
        return merged, None, None  # noise, uv
class GeneratorNSF(torch.nn.Module):
    """Upsampling decoder that adds an F0-driven source signal at every stage.

    The harmonic source from ``SourceModuleHnNSF`` is passed through one
    ``noise_convs`` layer per stage and added to that stage's upsampled
    features before the residual blocks. ``resblock == "1"`` selects
    ``modules.ResBlock1``; any other value selects ``modules.ResBlock2``.
    """

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels,
        sr,
        is_half=False,
    ):
        super().__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
        self.m_source = SourceModuleHnNSF(
            sampling_rate=sr, harmonic_num=0, is_half=is_half
        )
        self.noise_convs = nn.ModuleList()
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        block_cls = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        self.ups = nn.ModuleList()
        n_stages = len(upsample_rates)
        for stage, (rate, kernel) in enumerate(
            zip(upsample_rates, upsample_kernel_sizes)
        ):
            in_ch = upsample_initial_channel // (2**stage)
            out_ch = upsample_initial_channel // (2 ** (stage + 1))
            up = ConvTranspose1d(
                in_ch, out_ch, kernel, rate, padding=(kernel - rate) // 2
            )
            self.ups.append(weight_norm(up))
            if stage + 1 < n_stages:
                # Stride is the product of the upsample rates still to come.
                remaining = np.prod(upsample_rates[stage + 1 :])
                source_conv = Conv1d(
                    1,
                    out_ch,
                    kernel_size=remaining * 2,
                    stride=remaining,
                    padding=remaining // 2,
                )
            else:
                # Last stage: a pointwise convolution.
                source_conv = Conv1d(1, out_ch, kernel_size=1)
            self.noise_convs.append(source_conv)

        # num_kernels residual blocks per stage, stored contiguously.
        self.resblocks = nn.ModuleList()
        for stage in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (stage + 1))
            for kernel, dilation in zip(
                resblock_kernel_sizes, resblock_dilation_sizes
            ):
                self.resblocks.append(block_cls(ch, kernel, dilation))

        # ``ch`` is the channel count of the last stage.
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)
        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
        self.upp = np.prod(upsample_rates)

    def forward(self, x, f0, g=None):
        """Decode ``x`` using the source built from ``f0``, optionally
        conditioned on ``g``; output passes through ``tanh``.
        """
        source = self.m_source(f0, self.upp)[0]
        source = source.transpose(1, 2)
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)
        for stage in range(self.num_upsamples):
            x = self.ups[stage](F.leaky_relu(x, modules.LRELU_SLOPE))
            x = x + self.noise_convs[stage](source)
            first = stage * self.num_kernels
            acc = None
            for offset in range(self.num_kernels):
                branch = self.resblocks[first + offset](x)
                if acc is None:
                    acc = branch
                else:
                    acc += branch
            x = acc / self.num_kernels
        # Final activation uses leaky_relu's default slope.
        x = self.conv_post(F.leaky_relu(x))
        return torch.tanh(x)

    def remove_weight_norm(self):
        """Strip weight norm from the upsamplers and every residual block."""
        for up in self.ups:
            remove_weight_norm(up)
        for block in self.resblocks:
            block.remove_weight_norm()
# Maps a sample-rate tag ("32k", "40k", "48k") to the rate as an integer.
sr2sr = {f"{khz}k": khz * 1000 for khz in (32, 40, 48)}
class SynthesizerTrnMsNSFsidM(nn.Module):
    """Synthesizer combining a text encoder, a reversible flow and an
    NSF decoder, with an optional speaker-mix map.

    Args:
        sr: sample rate, either an integer or one of the ``sr2sr`` keys.
        spk_embed_dim, gin_channels: size of the speaker embedding table and
            of each embedding.
        **kwargs: ``is_half`` is read and forwarded to ``GeneratorNSF``; it
            defaults to ``False`` when omitted. Other keys are ignored.

    The remaining arguments are stored on the instance and forwarded to the
    sub-modules built here.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr,
        **kwargs
    ):
        super().__init__()
        if isinstance(sr, str):
            sr = sr2sr[sr]
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim
        # NOTE(review): the encoder width is selected from gin_channels, not
        # from an explicit feature-size/version argument - confirm callers
        # rely on this.
        encoder_cls = TextEncoder256 if self.gin_channels == 256 else TextEncoder768
        self.enc_p = encoder_cls(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
        )
        self.dec = GeneratorNSF(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
            sr=sr,
            # Previously a missing key raised KeyError; fall back to
            # GeneratorNSF's own default instead.
            is_half=kwargs.get("is_half", False),
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        self.speaker_map = None
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        """Strip weight norm from the decoder, the flow and the posterior encoder."""
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def construct_spkmixmap(self, n_speaker):
        """Cache the embeddings of speakers ``0 .. n_speaker - 1`` in
        ``self.speaker_map``; once set, ``forward`` treats ``g`` as mixing
        weights rather than speaker ids.
        """
        self.speaker_map = torch.zeros((n_speaker, 1, 1, self.gin_channels))
        for i in range(n_speaker):
            self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]]))
        self.speaker_map = self.speaker_map.unsqueeze(0)

    def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None):
        """Synthesize output from phone features, pitch and F0.

        ``rnd`` is the externally supplied noise used to sample the prior;
        ``max_len`` truncates the latent along its last axis before decoding.
        """
        if self.speaker_map is not None:  # [N, S]  *  [S, B, 1, H]
            g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1))  # [N, S, B, 1, 1]
            g = g * self.speaker_map  # [N, S, B, 1, H]
            g = torch.sum(g, dim=1)  # [N, 1, B, 1, H]
            g = g.transpose(0, -1).transpose(0, -2).squeeze(0)  # [B, H, N]
        else:
            g = g.unsqueeze(0)
            g = self.emb_g(g).transpose(1, 2)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
        return o
class MultiPeriodDiscriminator(torch.nn.Module):
    """Bank of sub-discriminators: one ``DiscriminatorS`` plus one
    ``DiscriminatorP`` for each period in (2, 3, 5, 7, 11, 17).

    Args:
        use_spectral_norm: forwarded unchanged to every sub-discriminator.
    """

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        period_set = (2, 3, 5, 7, 11, 17)
        # The DiscriminatorS is built first, then the period discriminators
        # in list order.
        members = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        for period in period_set:
            members.append(DiscriminatorP(period, use_spectral_norm=use_spectral_norm))
        self.discriminators = nn.ModuleList(members)

    def forward(self, y, y_hat):
        """Apply every sub-discriminator to ``y`` and to ``y_hat``.

        Returns:
            Four lists with one entry per sub-discriminator, in the order of
            ``self.discriminators``: outputs for ``y``, outputs for ``y_hat``,
            feature maps for ``y`` and feature maps for ``y_hat``.
        """
        outs_y, outs_y_hat = [], []
        fmaps_y, fmaps_y_hat = [], []
        for disc in self.discriminators:
            out_y, fmap_y = disc(y)
            out_y_hat, fmap_y_hat = disc(y_hat)
            outs_y.append(out_y)
            outs_y_hat.append(out_y_hat)
            fmaps_y.append(fmap_y)
            fmaps_y_hat.append(fmap_y_hat)
        return outs_y, outs_y_hat, fmaps_y, fmaps_y_hat
class MultiPeriodDiscriminatorV2(torch.nn.Module):
    """Bank of sub-discriminators: one ``DiscriminatorS`` plus one
    ``DiscriminatorP`` for each period in (2, 3, 5, 7, 11, 17, 23, 37).

    Args:
        use_spectral_norm: forwarded unchanged to every sub-discriminator.
    """

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        period_set = (2, 3, 5, 7, 11, 17, 23, 37)
        # The DiscriminatorS is built first, then the period discriminators
        # in list order.
        members = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        for period in period_set:
            members.append(DiscriminatorP(period, use_spectral_norm=use_spectral_norm))
        self.discriminators = nn.ModuleList(members)

    def forward(self, y, y_hat):
        """Apply every sub-discriminator to ``y`` and to ``y_hat``.

        Returns:
            Four lists with one entry per sub-discriminator, in the order of
            ``self.discriminators``: outputs for ``y``, outputs for ``y_hat``,
            feature maps for ``y`` and feature maps for ``y_hat``.
        """
        outs_y, outs_y_hat = [], []
        fmaps_y, fmaps_y_hat = [], []
        for disc in self.discriminators:
            out_y, fmap_y = disc(y)
            out_y_hat, fmap_y_hat = disc(y_hat)
            outs_y.append(out_y)
            outs_y_hat.append(out_y_hat)
            fmaps_y.append(fmap_y)
            fmaps_y_hat.append(fmap_y_hat)
        return outs_y, outs_y_hat, fmaps_y, fmaps_y_hat
class DiscriminatorS(torch.nn.Module):
    """Stack of normalised 1-D convolutions applied to a single-channel input.

    Args:
        use_spectral_norm: when truthy every convolution is wrapped with
            ``spectral_norm``; otherwise with ``weight_norm``.
    """

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        # Truthiness test rather than ``== False`` (PEP 8).
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        # (in_channels, out_channels, kernel_size, stride, groups, padding)
        layer_specs = (
            (1, 16, 15, 1, 1, 7),
            (16, 64, 41, 4, 4, 20),
            (64, 256, 41, 4, 16, 20),
            (256, 1024, 41, 4, 64, 20),
            (1024, 1024, 41, 4, 256, 20),
            (1024, 1024, 5, 1, 1, 2),
        )
        self.convs = nn.ModuleList(
            [
                norm_f(Conv1d(c_in, c_out, kernel, stride, groups=groups, padding=pad))
                for c_in, c_out, kernel, stride, groups, pad in layer_specs
            ]
        )
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return ``(flattened_output, fmap)``.

        ``fmap`` holds the activation after each layer in ``self.convs``
        (post leaky-ReLU) followed by the raw ``conv_post`` output; the first
        return value is that final output flattened from dim 1 onwards.
        """
        fmap = []
        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        return torch.flatten(x, 1, -1), fmap
class DiscriminatorP(torch.nn.Module):
    """Period discriminator: folds the last axis into ``(t // period, period)``
    and applies 2-D convolutions whose kernels span only the first of the two.

    Args:
        period: width the input's last axis is folded into.
        kernel_size: kernel height of the convolutions in ``self.convs``.
        stride: stride (along the kernel axis) of the first four convolutions.
        use_spectral_norm: when truthy wrap layers with ``spectral_norm``,
            otherwise with ``weight_norm``.
    """

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super().__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        # Truthiness test rather than ``== False`` (PEP 8).
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        # Same padding for every layer in ``convs``; computed once.
        pad = (get_padding(kernel_size, 1), 0)
        widths = (1, 32, 128, 512, 1024)
        layers = [
            norm_f(Conv2d(c_in, c_out, (kernel_size, 1), (stride, 1), padding=pad))
            for c_in, c_out in zip(widths[:-1], widths[1:])
        ]
        # The last layer keeps 1024 channels and uses stride 1.
        layers.append(norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=pad)))
        self.convs = nn.ModuleList(layers)
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return ``(flattened_output, fmap)`` for a 3-D input ``x``.

        ``fmap`` collects the post-activation output of each layer in
        ``self.convs`` followed by the ``conv_post`` output.
        """
        fmap = []
        b, c, t = x.shape
        leftover = t % self.period
        if leftover != 0:
            # Reflect-pad on the right so the length divides by the period.
            n_pad = self.period - leftover
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)
        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        return torch.flatten(x, 1, -1), fmap

src/infer_pack/models_onnx_moess.py ADDED Viewed

	@@ -0,0 +1,849 @@

+import math, pdb, os
+from time import time as ttime
+import torch
+from torch import nn
+from torch.nn import functional as F
+from infer_pack import modules
+from infer_pack import attentions
+from infer_pack import commons
+from infer_pack.commons import init_weights, get_padding
+from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+from infer_pack.commons import init_weights
+import numpy as np
+from infer_pack import commons
class TextEncoder256(nn.Module):
    """Encoder for 256-dimensional phone features with optional pitch embedding.

    Args:
        out_channels: size of each of the two outputs ``m`` and ``logs``.
        hidden_channels: width of the embeddings and of ``attentions.Encoder``.
        filter_channels, n_heads, n_layers, kernel_size, p_dropout: passed
            through to ``attentions.Encoder``.
        f0: when truthy a 256-entry pitch embedding ``emb_pitch`` is created.
    """

    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(256, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        """Return ``(m, logs, x_mask)``.

        ``pitch`` may be ``None``, in which case only the phone embedding is
        used. ``m`` and ``logs`` are the two ``out_channels``-sized halves of
        the masked projection.
        """
        x = self.emb_phone(phone)
        # Identity check: ``== None`` on a tensor relies on comparison fallback.
        if pitch is not None:
            x = x + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask
+class TextEncoder256Sim(nn.Module):
+    def __init__(
+        self,
+        out_channels,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size,
+        p_dropout,
+        f0=True,
+    ):
+        super().__init__()
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.emb_phone = nn.Linear(256, hidden_channels)
+        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
+        if f0 == True:
+            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
+        self.encoder = attentions.Encoder(
+            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
+        )
+        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+    def forward(self, phone, pitch, lengths):
+        if pitch == None:
+            x = self.emb_phone(phone)
+        else:
+            x = self.emb_phone(phone) + self.emb_pitch(pitch)
+        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
+        x = self.lrelu(x)
+        x = torch.transpose(x, 1, -1)  # [b, h, t]
+        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
+            x.dtype
+        )
+        x = self.encoder(x * x_mask, x_mask)
+        x = self.proj(x) * x_mask
+        return x, x_mask
+class ResidualCouplingBlock(nn.Module):
+    def __init__(
+        self,
+        channels,
+        hidden_channels,
+        kernel_size,
+        dilation_rate,
+        n_layers,
+        n_flows=4,
+        gin_channels=0,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.n_flows = n_flows
+        self.gin_channels = gin_channels
+        self.flows = nn.ModuleList()
+        for i in range(n_flows):
+            self.flows.append(
+                modules.ResidualCouplingLayer(
+                    channels,
+                    hidden_channels,
+                    kernel_size,
+                    dilation_rate,
+                    n_layers,
+                    gin_channels=gin_channels,
+                    mean_only=True,
+                )
+            )
+            self.flows.append(modules.Flip())
+    def forward(self, x, x_mask, g=None, reverse=False):
+        if not reverse:
+            for flow in self.flows:
+                x, _ = flow(x, x_mask, g=g, reverse=reverse)
+        else:
+            for flow in reversed(self.flows):
+                x = flow(x, x_mask, g=g, reverse=reverse)
+        return x
+    def remove_weight_norm(self):
+        for i in range(self.n_flows):
+            self.flows[i * 2].remove_weight_norm()
+class PosteriorEncoder(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        hidden_channels,
+        kernel_size,
+        dilation_rate,
+        n_layers,
+        gin_channels=0,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels
+        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
+        self.enc = modules.WN(
+            hidden_channels,
+            kernel_size,
+            dilation_rate,
+            n_layers,
+            gin_channels=gin_channels,
+        )
+        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+    def forward(self, x, x_lengths, g=None):
+        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
+            x.dtype
+        )
+        x = self.pre(x) * x_mask
+        x = self.enc(x, x_mask, g=g)
+        stats = self.proj(x) * x_mask
+        m, logs = torch.split(stats, self.out_channels, dim=1)
+        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
+        return z, m, logs, x_mask
+    def remove_weight_norm(self):
+        self.enc.remove_weight_norm()
+class Generator(torch.nn.Module):
+    def __init__(
+        self,
+        initial_channel,
+        resblock,
+        resblock_kernel_sizes,
+        resblock_dilation_sizes,
+        upsample_rates,
+        upsample_initial_channel,
+        upsample_kernel_sizes,
+        gin_channels=0,
+    ):
+        super(Generator, self).__init__()
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_rates)
+        self.conv_pre = Conv1d(
+            initial_channel, upsample_initial_channel, 7, 1, padding=3
+        )
+        resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
+        self.ups = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            self.ups.append(
+                weight_norm(
+                    ConvTranspose1d(
+                        upsample_initial_channel // (2**i),
+                        upsample_initial_channel // (2 ** (i + 1)),
+                        k,
+                        u,
+                        padding=(k - u) // 2,
+                    )
+                )
+            )
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = upsample_initial_channel // (2 ** (i + 1))
+            for j, (k, d) in enumerate(
+                zip(resblock_kernel_sizes, resblock_dilation_sizes)
+            ):
+                self.resblocks.append(resblock(ch, k, d))
+        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
+        self.ups.apply(init_weights)
+        if gin_channels != 0:
+            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
+    def forward(self, x, g=None):
+        x = self.conv_pre(x)
+        if g is not None:
+            x = x + self.cond(g)
+        for i in range(self.num_upsamples):
+            x = F.leaky_relu(x, modules.LRELU_SLOPE)
+            x = self.ups[i](x)
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels
+        x = F.leaky_relu(x)
+        x = self.conv_post(x)
+        x = torch.tanh(x)
+        return x
+    def remove_weight_norm(self):
+        for l in self.ups:
+            remove_weight_norm(l)
+        for l in self.resblocks:
+            l.remove_weight_norm()
+class SineGen(torch.nn.Module):
+    """Sine-wave source generator driven by an F0 contour.
+
+    SineGen(samp_rate, harmonic_num = 0,
+            sine_amp = 0.1, noise_std = 0.003,
+            voiced_threshold = 0,
+            flag_for_pulse=False)
+    samp_rate: sampling rate in Hz
+    harmonic_num: number of harmonic overtones (default 0)
+    sine_amp: amplitude of sine-waveform (default 0.1)
+    noise_std: std of Gaussian noise (default 0.003)
+    voiced_threshold: F0 threshold for U/V classification (default 0)
+    flag_for_pulse: accepted in the signature (default False) but neither
+        stored nor read by this implementation
+    """
+    def __init__(
+        self,
+        samp_rate,
+        harmonic_num=0,
+        sine_amp=0.1,
+        noise_std=0.003,
+        voiced_threshold=0,
+        flag_for_pulse=False,
+    ):
+        super(SineGen, self).__init__()
+        self.sine_amp = sine_amp
+        self.noise_std = noise_std
+        self.harmonic_num = harmonic_num
+        # One channel for the fundamental plus one per overtone.
+        self.dim = self.harmonic_num + 1
+        self.sampling_rate = samp_rate
+        self.voiced_threshold = voiced_threshold
+    def _f02uv(self, f0):
+        """Return a mask shaped like ``f0``: 1 where ``f0 > voiced_threshold``, else 0."""
+        # generate uv signal
+        uv = torch.ones_like(f0)
+        uv = uv * (f0 > self.voiced_threshold)
+        return uv
+    def forward(self, f0, upp):
+        """sine_waves, uv, noise = forward(f0, upp)
+
+        f0: F0 contour; it is expanded with ``f0[:, None].transpose(1, 2)``,
+            so a (batch, length) tensor is presumably expected -- TODO confirm
+            against callers. f0 for unvoiced steps should be 0.
+        upp: upsampling factor, used as ``scale_factor`` for the interpolations.
+        Returns the noisy sine waves, the upsampled voiced/unvoiced mask and
+        the noise that was added. Everything runs under ``torch.no_grad()``.
+        """
+        with torch.no_grad():
+            f0 = f0[:, None].transpose(1, 2)
+            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
+            # fundamental component
+            f0_buf[:, :, 0] = f0[:, :, 0]
+            for idx in np.arange(self.harmonic_num):
+                f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
+                    idx + 2
+                )  # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
+            # Per-frame phase increment in cycles, wrapped to [0, 1).
+            rad_values = (f0_buf / self.sampling_rate) % 1  # this % 1 means the per-harmonic products cannot be optimized in post-processing
+            # Random initial phase per channel; channel 0 (the fundamental) starts at 0.
+            rand_ini = torch.rand(
+                f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
+            )
+            rand_ini[:, 0] = 0
+            rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
+            tmp_over_one = torch.cumsum(rad_values, 1)  # % 1  # (a % 1 here would mean the later cumsum can no longer be optimized)
+            tmp_over_one *= upp
+            # Bring the accumulated phase to the upsampled resolution (linear),
+            # and repeat the per-frame increments to match it (nearest).
+            tmp_over_one = F.interpolate(
+                tmp_over_one.transpose(2, 1),
+                scale_factor=upp,
+                mode="linear",
+                align_corners=True,
+            ).transpose(2, 1)
+            rad_values = F.interpolate(
+                rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
+            ).transpose(
+                2, 1
+            )
+            tmp_over_one %= 1
+            # A drop between consecutive wrapped phases marks a wrap-around;
+            # subtract one full cycle there before the final cumsum.
+            tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
+            cumsum_shift = torch.zeros_like(rad_values)
+            cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
+            sine_waves = torch.sin(
+                torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
+            )
+            sine_waves = sine_waves * self.sine_amp
+            uv = self._f02uv(f0)
+            uv = F.interpolate(
+                uv.transpose(2, 1), scale_factor=upp, mode="nearest"
+            ).transpose(2, 1)
+            # Noise std is noise_std where voiced and sine_amp / 3 where unvoiced.
+            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+            noise = noise_amp * torch.randn_like(sine_waves)
+            # Unvoiced positions keep only the noise.
+            sine_waves = sine_waves * uv + noise
+        return sine_waves, uv, noise
+class SourceModuleHnNSF(torch.nn.Module):
+    """SourceModule for hn-nsf
+    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
+                 add_noise_std=0.003, voiced_threshod=0)
+    sampling_rate: sampling_rate in Hz
+    harmonic_num: number of harmonic above F0 (default: 0)
+    sine_amp: amplitude of sine source signal (default: 0.1)
+    add_noise_std: std of additive Gaussian noise (default: 0.003)
+        note that amplitude of noise in unvoiced is decided
+        by sine_amp
+    voiced_threshold: threhold to set U/V given F0 (default: 0)
+    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
+    F0_sampled (batchsize, length, 1)
+    Sine_source (batchsize, length, 1)
+    noise_source (batchsize, length 1)
+    uv (batchsize, length, 1)
+    """
+    def __init__(
+        self,
+        sampling_rate,
+        harmonic_num=0,
+        sine_amp=0.1,
+        add_noise_std=0.003,
+        voiced_threshod=0,
+        is_half=True,
+    ):
+        super(SourceModuleHnNSF, self).__init__()
+        self.sine_amp = sine_amp
+        self.noise_std = add_noise_std
+        self.is_half = is_half
+        # to produce sine waveforms
+        self.l_sin_gen = SineGen(
+            sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
+        )
+        # to merge source harmonics into a single excitation
+        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
+        self.l_tanh = torch.nn.Tanh()
+    def forward(self, x, upp=None):
+        sine_wavs, uv, _ = self.l_sin_gen(x, upp)
+        if self.is_half:
+            sine_wavs = sine_wavs.half()
+        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
+        return sine_merge, None, None  # noise, uv
+class GeneratorNSF(torch.nn.Module):
+    """Upsampling decoder that injects a sine-based harmonic source at every stage.
+
+    Each stage applies a weight-normed transposed conv, adds the output of the
+    matching ``noise_convs`` layer applied to the harmonic source, and then
+    averages ``num_kernels`` residual blocks.
+    """
+    def __init__(
+        self,
+        initial_channel,
+        resblock,
+        resblock_kernel_sizes,
+        resblock_dilation_sizes,
+        upsample_rates,
+        upsample_initial_channel,
+        upsample_kernel_sizes,
+        gin_channels,
+        sr,
+        is_half=False,
+    ):
+        super(GeneratorNSF, self).__init__()
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_rates)
+        # NOTE(review): f0_upsamp is created here but not used by forward() below.
+        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
+        self.m_source = SourceModuleHnNSF(
+            sampling_rate=sr, harmonic_num=0, is_half=is_half
+        )
+        self.noise_convs = nn.ModuleList()
+        self.conv_pre = Conv1d(
+            initial_channel, upsample_initial_channel, 7, 1, padding=3
+        )
+        # The string "1" selects ResBlock1; anything else selects ResBlock2.
+        resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
+        self.ups = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            # Output channel count of this stage (halves at every stage).
+            c_cur = upsample_initial_channel // (2 ** (i + 1))
+            self.ups.append(
+                weight_norm(
+                    ConvTranspose1d(
+                        upsample_initial_channel // (2**i),
+                        upsample_initial_channel // (2 ** (i + 1)),
+                        k,
+                        u,
+                        padding=(k - u) // 2,
+                    )
+                )
+            )
+            if i + 1 < len(upsample_rates):
+                # Stride by the product of the remaining upsample rates so the
+                # source is downsampled towards this stage's resolution.
+                stride_f0 = np.prod(upsample_rates[i + 1 :])
+                self.noise_convs.append(
+                    Conv1d(
+                        1,
+                        c_cur,
+                        kernel_size=stride_f0 * 2,
+                        stride=stride_f0,
+                        padding=stride_f0 // 2,
+                    )
+                )
+            else:
+                # Last stage: a 1x1 conv, no striding.
+                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = upsample_initial_channel // (2 ** (i + 1))
+            for j, (k, d) in enumerate(
+                zip(resblock_kernel_sizes, resblock_dilation_sizes)
+            ):
+                self.resblocks.append(resblock(ch, k, d))
+        # ``ch`` still holds the channel count of the last stage here.
+        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
+        self.ups.apply(init_weights)
+        if gin_channels != 0:
+            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
+        # Total upsampling factor, handed to the source module in forward().
+        self.upp = np.prod(upsample_rates)
+    def forward(self, x, f0, g=None):
+        """Decode ``x`` using the harmonic source built from ``f0``; ``g`` is optional conditioning."""
+        # SourceModuleHnNSF.forward returns None for the second and third items.
+        har_source, noi_source, uv = self.m_source(f0, self.upp)
+        har_source = har_source.transpose(1, 2)
+        x = self.conv_pre(x)
+        if g is not None:
+            x = x + self.cond(g)
+        for i in range(self.num_upsamples):
+            x = F.leaky_relu(x, modules.LRELU_SLOPE)
+            x = self.ups[i](x)
+            x_source = self.noise_convs[i](har_source)
+            x = x + x_source
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            # Average the parallel residual branches of this stage.
+            x = xs / self.num_kernels
+        x = F.leaky_relu(x)
+        x = self.conv_post(x)
+        x = torch.tanh(x)
+        return x
+    def remove_weight_norm(self):
+        """Strip weight norm from every upsampling layer and residual block."""
+        for l in self.ups:
+            remove_weight_norm(l)
+        for l in self.resblocks:
+            l.remove_weight_norm()
+# Maps sample-rate labels to the integer rate in Hz.
+sr2sr = {
+    "32k": 32000,
+    "40k": 40000,
+    "48k": 48000,
+}
+class SynthesizerTrnMs256NSFsidM(nn.Module):
+    def __init__(
+        self,
+        spec_channels,
+        segment_size,
+        inter_channels,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size,
+        p_dropout,
+        resblock,
+        resblock_kernel_sizes,
+        resblock_dilation_sizes,
+        upsample_rates,
+        upsample_initial_channel,
+        upsample_kernel_sizes,
+        spk_embed_dim,
+        gin_channels,
+        sr,
+        **kwargs
+    ):
+        super().__init__()
+        if type(sr) == type("strr"):
+            sr = sr2sr[sr]
+        self.spec_channels = spec_channels
+        self.inter_channels = inter_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.resblock = resblock
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.upsample_rates = upsample_rates
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.segment_size = segment_size
+        self.gin_channels = gin_channels
+        # self.hop_length = hop_length#
+        self.spk_embed_dim = spk_embed_dim
+        self.enc_p = TextEncoder256(
+            inter_channels,
+            hidden_channels,
+            filter_channels,
+            n_heads,
+            n_layers,
+            kernel_size,
+            p_dropout,
+        )
+        self.dec = GeneratorNSF(
+            inter_channels,
+            resblock,
+            resblock_kernel_sizes,
+            resblock_dilation_sizes,
+            upsample_rates,
+            upsample_initial_channel,
+            upsample_kernel_sizes,
+            gin_channels=gin_channels,
+            sr=sr,
+            is_half=kwargs["is_half"],
+        )
+        self.enc_q = PosteriorEncoder(
+            spec_channels,
+            inter_channels,
+            hidden_channels,
+            5,
+            1,
+            16,
+            gin_channels=gin_channels,
+        )
+        self.flow = ResidualCouplingBlock(
+            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
+        )
+        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
+        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
+    def remove_weight_norm(self):
+        self.dec.remove_weight_norm()
+        self.flow.remove_weight_norm()
+        self.enc_q.remove_weight_norm()
+    def forward(self, phone, phone_lengths, pitch, nsff0, sid, rnd, max_len=None):
+        g = self.emb_g(sid).unsqueeze(-1)
+        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
+        z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask
+        z = self.flow(z_p, x_mask, g=g, reverse=True)
+        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
+        return o
+class SynthesizerTrnMs256NSFsid_sim(nn.Module):
+    """
+    Synthesizer for Training
+    """
+    def __init__(
+        self,
+        spec_channels,
+        segment_size,
+        inter_channels,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size,
+        p_dropout,
+        resblock,
+        resblock_kernel_sizes,
+        resblock_dilation_sizes,
+        upsample_rates,
+        upsample_initial_channel,
+        upsample_kernel_sizes,
+        spk_embed_dim,
+        # hop_length,
+        gin_channels=0,
+        use_sdp=True,
+        **kwargs
+    ):
+        super().__init__()
+        self.spec_channels = spec_channels
+        self.inter_channels = inter_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.resblock = resblock
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.upsample_rates = upsample_rates
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.segment_size = segment_size
+        self.gin_channels = gin_channels
+        # self.hop_length = hop_length#
+        self.spk_embed_dim = spk_embed_dim
+        self.enc_p = TextEncoder256Sim(
+            inter_channels,
+            hidden_channels,
+            filter_channels,
+            n_heads,
+            n_layers,
+            kernel_size,
+            p_dropout,
+        )
+        self.dec = GeneratorNSF(
+            inter_channels,
+            resblock,
+            resblock_kernel_sizes,
+            resblock_dilation_sizes,
+            upsample_rates,
+            upsample_initial_channel,
+            upsample_kernel_sizes,
+            gin_channels=gin_channels,
+            is_half=kwargs["is_half"],
+        )
+        self.flow = ResidualCouplingBlock(
+            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
+        )
+        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
+        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
+    def remove_weight_norm(self):
+        self.dec.remove_weight_norm()
+        self.flow.remove_weight_norm()
+        self.enc_q.remove_weight_norm()
+    def forward(
+        self, phone, phone_lengths, pitch, pitchf, ds, max_len=None
+    ):  # y是spec不需要了现在
+        g = self.emb_g(ds.unsqueeze(0)).unsqueeze(-1)  # [b, 256, 1]##1是t，广播的
+        x, x_mask = self.enc_p(phone, pitch, phone_lengths)
+        x = self.flow(x, x_mask, g=g, reverse=True)
+        o = self.dec((x * x_mask)[:, :, :max_len], pitchf, g=g)
+        return o
+class MultiPeriodDiscriminator(torch.nn.Module):
+    def __init__(self, use_spectral_norm=False):
+        super(MultiPeriodDiscriminator, self).__init__()
+        periods = [2, 3, 5, 7, 11, 17]
+        # periods = [3, 5, 7, 11, 17, 23, 37]
+        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
+        discs = discs + [
+            DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
+        ]
+        self.discriminators = nn.ModuleList(discs)
+    def forward(self, y, y_hat):
+        y_d_rs = []  #
+        y_d_gs = []
+        fmap_rs = []
+        fmap_gs = []
+        for i, d in enumerate(self.discriminators):
+            y_d_r, fmap_r = d(y)
+            y_d_g, fmap_g = d(y_hat)
+            # for j in range(len(fmap_r)):
+            #     print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
+            y_d_rs.append(y_d_r)
+            y_d_gs.append(y_d_g)
+            fmap_rs.append(fmap_r)
+            fmap_gs.append(fmap_g)
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+class DiscriminatorS(torch.nn.Module):
+    def __init__(self, use_spectral_norm=False):
+        super(DiscriminatorS, self).__init__()
+        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+        self.convs = nn.ModuleList(
+            [
+                norm_f(Conv1d(1, 16, 15, 1, padding=7)),
+                norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
+                norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
+                norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
+                norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
+                norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
+            ]
+        )
+        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
+    def forward(self, x):
+        fmap = []
+        for l in self.convs:
+            x = l(x)
+            x = F.leaky_relu(x, modules.LRELU_SLOPE)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+        x = torch.flatten(x, 1, -1)
+        return x, fmap
+class DiscriminatorP(torch.nn.Module):
+    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
+        super(DiscriminatorP, self).__init__()
+        self.period = period
+        self.use_spectral_norm = use_spectral_norm
+        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+        self.convs = nn.ModuleList(
+            [
+                norm_f(
+                    Conv2d(
+                        1,
+                        32,
+                        (kernel_size, 1),
+                        (stride, 1),
+                        padding=(get_padding(kernel_size, 1), 0),
+                    )
+                ),
+                norm_f(
+                    Conv2d(
+                        32,
+                        128,
+                        (kernel_size, 1),
+                        (stride, 1),
+                        padding=(get_padding(kernel_size, 1), 0),
+                    )
+                ),
+                norm_f(
+                    Conv2d(
+                        128,
+                        512,
+                        (kernel_size, 1),
+                        (stride, 1),
+                        padding=(get_padding(kernel_size, 1), 0),
+                    )
+                ),
+                norm_f(
+                    Conv2d(
+                        512,
+                        1024,
+                        (kernel_size, 1),
+                        (stride, 1),
+                        padding=(get_padding(kernel_size, 1), 0),
+                    )
+                ),
+                norm_f(
+                    Conv2d(
+                        1024,
+                        1024,
+                        (kernel_size, 1),
+                        1,
+                        padding=(get_padding(kernel_size, 1), 0),
+                    )
+                ),
+            ]
+        )
+        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+    def forward(self, x):
+        fmap = []
+        # 1d to 2d
+        b, c, t = x.shape
+        if t % self.period != 0:  # pad first
+            n_pad = self.period - (t % self.period)
+            x = F.pad(x, (0, n_pad), "reflect")
+            t = t + n_pad
+        x = x.view(b, c, t // self.period, self.period)
+        for l in self.convs:
+            x = l(x)
+            x = F.leaky_relu(x, modules.LRELU_SLOPE)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+        x = torch.flatten(x, 1, -1)
+        return x, fmap

src/infer_pack/modules.py ADDED Viewed

	@@ -0,0 +1,522 @@

+import copy
+import math
+import numpy as np
+import scipy
+import torch
+from torch import nn
+from torch.nn import functional as F
+from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+from torch.nn.utils import weight_norm, remove_weight_norm
+from src.infer_pack import commons
+from src.infer_pack.commons import init_weights, get_padding
+from src.infer_pack.transforms import piecewise_rational_quadratic_transform
+# Shared leaky-ReLU negative slope.
+LRELU_SLOPE = 0.1
+class LayerNorm(nn.Module):
+    def __init__(self, channels, eps=1e-5):
+        super().__init__()
+        self.channels = channels
+        self.eps = eps
+        self.gamma = nn.Parameter(torch.ones(channels))
+        self.beta = nn.Parameter(torch.zeros(channels))
+    def forward(self, x):
+        x = x.transpose(1, -1)
+        x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
+        return x.transpose(1, -1)
+class ConvReluNorm(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        hidden_channels,
+        out_channels,
+        kernel_size,
+        n_layers,
+        p_dropout,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.hidden_channels = hidden_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.p_dropout = p_dropout
+        assert n_layers > 1, "Number of layers should be larger than 0."
+        self.conv_layers = nn.ModuleList()
+        self.norm_layers = nn.ModuleList()
+        self.conv_layers.append(
+            nn.Conv1d(
+                in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
+            )
+        )
+        self.norm_layers.append(LayerNorm(hidden_channels))
+        self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
+        for _ in range(n_layers - 1):
+            self.conv_layers.append(
+                nn.Conv1d(
+                    hidden_channels,
+                    hidden_channels,
+                    kernel_size,
+                    padding=kernel_size // 2,
+                )
+            )
+            self.norm_layers.append(LayerNorm(hidden_channels))
+        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+        self.proj.weight.data.zero_()
+        self.proj.bias.data.zero_()
+    def forward(self, x, x_mask):
+        x_org = x
+        for i in range(self.n_layers):
+            x = self.conv_layers[i](x * x_mask)
+            x = self.norm_layers[i](x)
+            x = self.relu_drop(x)
+        x = x_org + self.proj(x)
+        return x * x_mask
+class DDSConv(nn.Module):
+    """
+    Dialted and Depth-Separable Convolution
+    """
+    def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
+        super().__init__()
+        self.channels = channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.p_dropout = p_dropout
+        self.drop = nn.Dropout(p_dropout)
+        self.convs_sep = nn.ModuleList()
+        self.convs_1x1 = nn.ModuleList()
+        self.norms_1 = nn.ModuleList()
+        self.norms_2 = nn.ModuleList()
+        for i in range(n_layers):
+            dilation = kernel_size**i
+            padding = (kernel_size * dilation - dilation) // 2
+            self.convs_sep.append(
+                nn.Conv1d(
+                    channels,
+                    channels,
+                    kernel_size,
+                    groups=channels,
+                    dilation=dilation,
+                    padding=padding,
+                )
+            )
+            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
+            self.norms_1.append(LayerNorm(channels))
+            self.norms_2.append(LayerNorm(channels))
+    def forward(self, x, x_mask, g=None):
+        if g is not None:
+            x = x + g
+        for i in range(self.n_layers):
+            y = self.convs_sep[i](x * x_mask)
+            y = self.norms_1[i](y)
+            y = F.gelu(y)
+            y = self.convs_1x1[i](y)
+            y = self.norms_2[i](y)
+            y = F.gelu(y)
+            y = self.drop(y)
+            x = x + y
+        return x * x_mask
+class WN(torch.nn.Module):
+    def __init__(
+        self,
+        hidden_channels,
+        kernel_size,
+        dilation_rate,
+        n_layers,
+        gin_channels=0,
+        p_dropout=0,
+    ):
+        super(WN, self).__init__()
+        assert kernel_size % 2 == 1
+        self.hidden_channels = hidden_channels
+        self.kernel_size = (kernel_size,)
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels
+        self.p_dropout = p_dropout
+        self.in_layers = torch.nn.ModuleList()
+        self.res_skip_layers = torch.nn.ModuleList()
+        self.drop = nn.Dropout(p_dropout)
+        if gin_channels != 0:
+            cond_layer = torch.nn.Conv1d(
+                gin_channels, 2 * hidden_channels * n_layers, 1
+            )
+            self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
+        for i in range(n_layers):
+            dilation = dilation_rate**i
+            padding = int((kernel_size * dilation - dilation) / 2)
+            in_layer = torch.nn.Conv1d(
+                hidden_channels,
+                2 * hidden_channels,
+                kernel_size,
+                dilation=dilation,
+                padding=padding,
+            )
+            in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
+            self.in_layers.append(in_layer)
+            # last one is not necessary
+            if i < n_layers - 1:
+                res_skip_channels = 2 * hidden_channels
+            else:
+                res_skip_channels = hidden_channels
+            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
+            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
+            self.res_skip_layers.append(res_skip_layer)
+    def forward(self, x, x_mask, g=None, **kwargs):
+        output = torch.zeros_like(x)
+        n_channels_tensor = torch.IntTensor([self.hidden_channels])
+        if g is not None:
+            g = self.cond_layer(g)
+        for i in range(self.n_layers):
+            x_in = self.in_layers[i](x)
+            if g is not None:
+                cond_offset = i * 2 * self.hidden_channels
+                g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
+            else:
+                g_l = torch.zeros_like(x_in)
+            acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
+            acts = self.drop(acts)
+            res_skip_acts = self.res_skip_layers[i](acts)
+            if i < self.n_layers - 1:
+                res_acts = res_skip_acts[:, : self.hidden_channels, :]
+                x = (x + res_acts) * x_mask
+                output = output + res_skip_acts[:, self.hidden_channels :, :]
+            else:
+                output = output + res_skip_acts
+        return output * x_mask
+    def remove_weight_norm(self):
+        if self.gin_channels != 0:
+            torch.nn.utils.remove_weight_norm(self.cond_layer)
+        for l in self.in_layers:
+            torch.nn.utils.remove_weight_norm(l)
+        for l in self.res_skip_layers:
+            torch.nn.utils.remove_weight_norm(l)
+class ResBlock1(torch.nn.Module):
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
+        super(ResBlock1, self).__init__()
+        self.convs1 = nn.ModuleList(
+            [
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=dilation[0],
+                        padding=get_padding(kernel_size, dilation[0]),
+                    )
+                ),
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=dilation[1],
+                        padding=get_padding(kernel_size, dilation[1]),
+                    )
+                ),
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=dilation[2],
+                        padding=get_padding(kernel_size, dilation[2]),
+                    )
+                ),
+            ]
+        )
+        self.convs1.apply(init_weights)
+        self.convs2 = nn.ModuleList(
+            [
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=1,
+                        padding=get_padding(kernel_size, 1),
+                    )
+                ),
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=1,
+                        padding=get_padding(kernel_size, 1),
+                    )
+                ),
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=1,
+                        padding=get_padding(kernel_size, 1),
+                    )
+                ),
+            ]
+        )
+        self.convs2.apply(init_weights)
+    def forward(self, x, x_mask=None):
+        for c1, c2 in zip(self.convs1, self.convs2):
+            xt = F.leaky_relu(x, LRELU_SLOPE)
+            if x_mask is not None:
+                xt = xt * x_mask
+            xt = c1(xt)
+            xt = F.leaky_relu(xt, LRELU_SLOPE)
+            if x_mask is not None:
+                xt = xt * x_mask
+            xt = c2(xt)
+            x = xt + x
+        if x_mask is not None:
+            x = x * x_mask
+        return x
+    def remove_weight_norm(self):
+        for l in self.convs1:
+            remove_weight_norm(l)
+        for l in self.convs2:
+            remove_weight_norm(l)
+class ResBlock2(torch.nn.Module):
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
+        super(ResBlock2, self).__init__()
+        self.convs = nn.ModuleList(
+            [
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=dilation[0],
+                        padding=get_padding(kernel_size, dilation[0]),
+                    )
+                ),
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=dilation[1],
+                        padding=get_padding(kernel_size, dilation[1]),
+                    )
+                ),
+            ]
+        )
+        self.convs.apply(init_weights)
+    def forward(self, x, x_mask=None):
+        for c in self.convs:
+            xt = F.leaky_relu(x, LRELU_SLOPE)
+            if x_mask is not None:
+                xt = xt * x_mask
+            xt = c(xt)
+            x = xt + x
+        if x_mask is not None:
+            x = x * x_mask
+        return x
+    def remove_weight_norm(self):
+        for l in self.convs:
+            remove_weight_norm(l)
+class Log(nn.Module):
+    def forward(self, x, x_mask, reverse=False, **kwargs):
+        if not reverse:
+            y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
+            logdet = torch.sum(-y, [1, 2])
+            return y, logdet
+        else:
+            x = torch.exp(x) * x_mask
+            return x
+class Flip(nn.Module):
+    def forward(self, x, *args, reverse=False, **kwargs):
+        x = torch.flip(x, [1])
+        if not reverse:
+            logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
+            return x, logdet
+        else:
+            return x
+class ElementwiseAffine(nn.Module):
+    def __init__(self, channels):
+        super().__init__()
+        self.channels = channels
+        self.m = nn.Parameter(torch.zeros(channels, 1))
+        self.logs = nn.Parameter(torch.zeros(channels, 1))
+    def forward(self, x, x_mask, reverse=False, **kwargs):
+        if not reverse:
+            y = self.m + torch.exp(self.logs) * x
+            y = y * x_mask
+            logdet = torch.sum(self.logs * x_mask, [1, 2])
+            return y, logdet
+        else:
+            x = (x - self.m) * torch.exp(-self.logs) * x_mask
+            return x
+class ResidualCouplingLayer(nn.Module):
+    def __init__(
+        self,
+        channels,
+        hidden_channels,
+        kernel_size,
+        dilation_rate,
+        n_layers,
+        p_dropout=0,
+        gin_channels=0,
+        mean_only=False,
+    ):
+        assert channels % 2 == 0, "channels should be divisible by 2"
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.half_channels = channels // 2
+        self.mean_only = mean_only
+        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
+        self.enc = WN(
+            hidden_channels,
+            kernel_size,
+            dilation_rate,
+            n_layers,
+            p_dropout=p_dropout,
+            gin_channels=gin_channels,
+        )
+        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
+        self.post.weight.data.zero_()
+        self.post.bias.data.zero_()
+    def forward(self, x, x_mask, g=None, reverse=False):
+        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+        h = self.pre(x0) * x_mask
+        h = self.enc(h, x_mask, g=g)
+        stats = self.post(h) * x_mask
+        if not self.mean_only:
+            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
+        else:
+            m = stats
+            logs = torch.zeros_like(m)
+        if not reverse:
+            x1 = m + x1 * torch.exp(logs) * x_mask
+            x = torch.cat([x0, x1], 1)
+            logdet = torch.sum(logs, [1, 2])
+            return x, logdet
+        else:
+            x1 = (x1 - m) * torch.exp(-logs) * x_mask
+            x = torch.cat([x0, x1], 1)
+            return x
+    def remove_weight_norm(self):
+        self.enc.remove_weight_norm()
+class ConvFlow(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        filter_channels,
+        kernel_size,
+        n_layers,
+        num_bins=10,
+        tail_bound=5.0,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.filter_channels = filter_channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.num_bins = num_bins
+        self.tail_bound = tail_bound
+        self.half_channels = in_channels // 2
+        self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
+        self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
+        self.proj = nn.Conv1d(
+            filter_channels, self.half_channels * (num_bins * 3 - 1), 1
+        )
+        self.proj.weight.data.zero_()
+        self.proj.bias.data.zero_()
+    def forward(self, x, x_mask, g=None, reverse=False):
+        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+        h = self.pre(x0)
+        h = self.convs(h, x_mask, g=g)
+        h = self.proj(h) * x_mask
+        b, c, t = x0.shape
+        h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2)  # [b, cx?, t] -> [b, c, t, ?]
+        unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels)
+        unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(
+            self.filter_channels
+        )
+        unnormalized_derivatives = h[..., 2 * self.num_bins :]
+        x1, logabsdet = piecewise_rational_quadratic_transform(
+            x1,
+            unnormalized_widths,
+            unnormalized_heights,
+            unnormalized_derivatives,
+            inverse=reverse,
+            tails="linear",
+            tail_bound=self.tail_bound,
+        )
+        x = torch.cat([x0, x1], 1) * x_mask
+        logdet = torch.sum(logabsdet * x_mask, [1, 2])
+        if not reverse:
+            return x, logdet
+        else:
+            return x

src/infer_pack/transforms.py ADDED Viewed

	@@ -0,0 +1,209 @@

+import torch
+from torch.nn import functional as F
+import numpy as np
+# Default lower bounds used by the spline functions below for their
+# ``min_bin_width``, ``min_bin_height`` and ``min_derivative`` parameters.
+DEFAULT_MIN_BIN_WIDTH = 1e-3
+DEFAULT_MIN_BIN_HEIGHT = 1e-3
+DEFAULT_MIN_DERIVATIVE = 1e-3
+def piecewise_rational_quadratic_transform(
+    inputs,
+    unnormalized_widths,
+    unnormalized_heights,
+    unnormalized_derivatives,
+    inverse=False,
+    tails=None,
+    tail_bound=1.0,
+    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
+    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
+    min_derivative=DEFAULT_MIN_DERIVATIVE,
+):
+    if tails is None:
+        spline_fn = rational_quadratic_spline
+        spline_kwargs = {}
+    else:
+        spline_fn = unconstrained_rational_quadratic_spline
+        spline_kwargs = {"tails": tails, "tail_bound": tail_bound}
+    outputs, logabsdet = spline_fn(
+        inputs=inputs,
+        unnormalized_widths=unnormalized_widths,
+        unnormalized_heights=unnormalized_heights,
+        unnormalized_derivatives=unnormalized_derivatives,
+        inverse=inverse,
+        min_bin_width=min_bin_width,
+        min_bin_height=min_bin_height,
+        min_derivative=min_derivative,
+        **spline_kwargs
+    )
+    return outputs, logabsdet
+def searchsorted(bin_locations, inputs, eps=1e-6):
+    bin_locations[..., -1] += eps
+    return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1
+def unconstrained_rational_quadratic_spline(
+    inputs,
+    unnormalized_widths,
+    unnormalized_heights,
+    unnormalized_derivatives,
+    inverse=False,
+    tails="linear",
+    tail_bound=1.0,
+    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
+    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
+    min_derivative=DEFAULT_MIN_DERIVATIVE,
+):
+    inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
+    outside_interval_mask = ~inside_interval_mask
+    outputs = torch.zeros_like(inputs)
+    logabsdet = torch.zeros_like(inputs)
+    if tails == "linear":
+        unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
+        constant = np.log(np.exp(1 - min_derivative) - 1)
+        unnormalized_derivatives[..., 0] = constant
+        unnormalized_derivatives[..., -1] = constant
+        outputs[outside_interval_mask] = inputs[outside_interval_mask]
+        logabsdet[outside_interval_mask] = 0
+    else:
+        raise RuntimeError("{} tails are not implemented.".format(tails))
+    (
+        outputs[inside_interval_mask],
+        logabsdet[inside_interval_mask],
+    ) = rational_quadratic_spline(
+        inputs=inputs[inside_interval_mask],
+        unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
+        unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
+        unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
+        inverse=inverse,
+        left=-tail_bound,
+        right=tail_bound,
+        bottom=-tail_bound,
+        top=tail_bound,
+        min_bin_width=min_bin_width,
+        min_bin_height=min_bin_height,
+        min_derivative=min_derivative,
+    )
+    return outputs, logabsdet
+def rational_quadratic_spline(
+    inputs,
+    unnormalized_widths,
+    unnormalized_heights,
+    unnormalized_derivatives,
+    inverse=False,
+    left=0.0,
+    right=1.0,
+    bottom=0.0,
+    top=1.0,
+    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
+    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
+    min_derivative=DEFAULT_MIN_DERIVATIVE,
+):
+    """
+    Evaluate a monotonic piecewise rational-quadratic spline (or its inverse).
+    The spline maps ``[left, right]`` onto ``[bottom, top]``.  Bin widths and
+    heights come from a softmax over the unnormalised parameters (floored at
+    ``min_bin_width`` / ``min_bin_height``); knot derivatives are
+    ``min_derivative + softplus(unnormalized_derivatives)``.
+    Returns ``(outputs, logabsdet)``; in inverse mode the returned
+    log-determinant is negated.
+    Raises ``ValueError`` when an input lies outside ``[left, right]`` (this
+    range is checked in both directions) or when the minimum bin width/height
+    times the number of bins exceeds 1.
+    """
+    if torch.min(inputs) < left or torch.max(inputs) > right:
+        raise ValueError("Input to a transform is not within its domain")
+    num_bins = unnormalized_widths.shape[-1]
+    if min_bin_width * num_bins > 1.0:
+        raise ValueError("Minimal bin width too large for the number of bins")
+    if min_bin_height * num_bins > 1.0:
+        raise ValueError("Minimal bin height too large for the number of bins")
+    # Bin widths: softmax, floored at min_bin_width, still summing to 1.
+    widths = F.softmax(unnormalized_widths, dim=-1)
+    widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
+    # Knot x-positions: cumulative widths rescaled to [left, right]; the end
+    # points are pinned exactly and the widths recomputed from the knots.
+    cumwidths = torch.cumsum(widths, dim=-1)
+    cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
+    cumwidths = (right - left) * cumwidths + left
+    cumwidths[..., 0] = left
+    cumwidths[..., -1] = right
+    widths = cumwidths[..., 1:] - cumwidths[..., :-1]
+    # Knot derivatives are strictly positive (at least min_derivative).
+    derivatives = min_derivative + F.softplus(unnormalized_derivatives)
+    # Bin heights / knot y-positions: same construction on [bottom, top].
+    heights = F.softmax(unnormalized_heights, dim=-1)
+    heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
+    cumheights = torch.cumsum(heights, dim=-1)
+    cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
+    cumheights = (top - bottom) * cumheights + bottom
+    cumheights[..., 0] = bottom
+    cumheights[..., -1] = top
+    heights = cumheights[..., 1:] - cumheights[..., :-1]
+    # Locate each input's bin: along y when inverting, along x otherwise.
+    if inverse:
+        bin_idx = searchsorted(cumheights, inputs)[..., None]
+    else:
+        bin_idx = searchsorted(cumwidths, inputs)[..., None]
+    # Gather the parameters of the selected bin for every input element.
+    input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
+    input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
+    input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
+    # delta is the slope of the straight line across each bin.
+    delta = heights / widths
+    input_delta = delta.gather(-1, bin_idx)[..., 0]
+    # Derivatives at the left and right knot of the selected bin.
+    input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
+    input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
+    input_heights = heights.gather(-1, bin_idx)[..., 0]
+    if inverse:
+        # Solve a*root^2 + b*root + c = 0 for the relative position ``root``
+        # of the pre-image inside the bin.
+        a = (inputs - input_cumheights) * (
+            input_derivatives + input_derivatives_plus_one - 2 * input_delta
+        ) + input_heights * (input_delta - input_derivatives)
+        b = input_heights * input_derivatives - (inputs - input_cumheights) * (
+            input_derivatives + input_derivatives_plus_one - 2 * input_delta
+        )
+        c = -input_delta * (inputs - input_cumheights)
+        discriminant = b.pow(2) - 4 * a * c
+        assert (discriminant >= 0).all()
+        # Root written as 2c / (-b - sqrt(disc)) instead of the textbook
+        # (-b + sqrt(disc)) / 2a -- presumably for numerical stability when
+        # ``a`` is close to zero; NOTE(review): confirm.
+        root = (2 * c) / (-b - torch.sqrt(discriminant))
+        outputs = root * input_bin_widths + input_cumwidths
+        theta_one_minus_theta = root * (1 - root)
+        denominator = input_delta + (
+            (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
+            * theta_one_minus_theta
+        )
+        derivative_numerator = input_delta.pow(2) * (
+            input_derivatives_plus_one * root.pow(2)
+            + 2 * input_delta * theta_one_minus_theta
+            + input_derivatives * (1 - root).pow(2)
+        )
+        # Forward log-derivative evaluated at the recovered point; the inverse
+        # transform's log-determinant is its negation.
+        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
+        return outputs, -logabsdet
+    else:
+        # theta is the relative position of the input inside its bin.
+        theta = (inputs - input_cumwidths) / input_bin_widths
+        theta_one_minus_theta = theta * (1 - theta)
+        numerator = input_heights * (
+            input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
+        )
+        denominator = input_delta + (
+            (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
+            * theta_one_minus_theta
+        )
+        outputs = input_cumheights + numerator / denominator
+        derivative_numerator = input_delta.pow(2) * (
+            input_derivatives_plus_one * theta.pow(2)
+            + 2 * input_delta * theta_one_minus_theta
+            + input_derivatives * (1 - theta).pow(2)
+        )
+        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
+        return outputs, logabsdet

src/main.py ADDED Viewed

	@@ -0,0 +1,362 @@

+import argparse
+import gc
+import hashlib
+import json
+import os
+import shlex
+import subprocess
+from contextlib import suppress
+from urllib.parse import urlparse, parse_qs
+import gradio as gr
+import librosa
+import numpy as np
+import soundfile as sf
+import sox
+import yt_dlp
+from pedalboard import Pedalboard, Reverb, Compressor, HighpassFilter
+from pedalboard.io import AudioFile
+from pydub import AudioSegment
+from src.mdx import run_mdx, run_roformer
+from src.rvc import Config, load_hubert, get_vc, rvc_infer
+# Parent of the directory that contains this file; the folders below are
+# resolved relative to it.
+BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+mdxnet_models_dir = os.path.join(BASE_DIR, 'mdxnet_models')
+rvc_models_dir = os.path.join(BASE_DIR, 'rvc_models')
+output_dir = os.path.join(BASE_DIR, 'song_output')
+def get_youtube_video_id(url, ignore_playlist=True):
+    """
+    Examples:
+    http://youtu.be/SA2iWivDJiE
+    http://www.youtube.com/watch?v=_oPAwA_Udwc&feature=feedu
+    http://www.youtube.com/embed/SA2iWivDJiE
+    http://www.youtube.com/v/SA2iWivDJiE?version=3&amp;hl=en_US
+    """
+    query = urlparse(url)
+    if query.hostname == 'youtu.be':
+        if query.path[1:] == 'watch':
+            return query.query[2:]
+        return query.path[1:]
+    if query.hostname in {'www.youtube.com', 'youtube.com', 'music.youtube.com'}:
+        if not ignore_playlist:
+            # use case: get playlist id not current video in playlist
+            with suppress(KeyError):
+                return parse_qs(query.query)['list'][0]
+        if query.path == '/watch':
+            return parse_qs(query.query)['v'][0]
+        if query.path[:7] == '/watch/':
+            return query.path.split('/')[1]
+        if query.path[:7] == '/embed/':
+            return query.path.split('/')[2]
+        if query.path[:3] == '/v/':
+            return query.path.split('/')[2]
+    # returns None for invalid YouTube url
+    return None
+def yt_download(link):
+    ydl_opts = {
+        'format': 'bestaudio',
+        'outtmpl': '%(title)s',
+        'nocheckcertificate': True,
+        'ignoreerrors': True,
+        'no_warnings': True,
+        'quiet': True,
+        'extractaudio': True,
+        'postprocessors': [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'mp3'}],
+    }
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        result = ydl.extract_info(link, download=True)
+        download_path = ydl.prepare_filename(result, outtmpl='%(title)s.mp3')
+    return download_path
+def raise_exception(error_msg, is_webui):
+    if is_webui:
+        raise gr.Error(error_msg)
+    else:
+        raise Exception(error_msg)
+def get_rvc_model(voice_model, is_webui):
+    rvc_model_filename, rvc_index_filename = None, None
+    model_dir = os.path.join(rvc_models_dir, voice_model)
+    for file in os.listdir(model_dir):
+        ext = os.path.splitext(file)[1]
+        if ext == '.pth':
+            rvc_model_filename = file
+        if ext == '.index':
+            rvc_index_filename = file
+    if rvc_model_filename is None:
+        error_msg = f'No model file exists in {model_dir}.'
+        raise_exception(error_msg, is_webui)
+    return os.path.join(model_dir, rvc_model_filename), os.path.join(model_dir, rvc_index_filename) if rvc_index_filename else ''
+def get_audio_paths(song_dir):
+    orig_song_path = None
+    instrumentals_path = None
+    main_vocals_dereverb_path = None
+    backup_vocals_path = None
+    for file in os.listdir(song_dir):
+        if file.endswith('_Instrumental.wav'):
+            instrumentals_path = os.path.join(song_dir, file)
+            orig_song_path = instrumentals_path.replace('_Instrumental', '')
+        elif file.endswith('_Vocals_Main_DeReverb.wav'):
+            main_vocals_dereverb_path = os.path.join(song_dir, file)
+        elif file.endswith('_Vocals_Backup.wav'):
+            backup_vocals_path = os.path.join(song_dir, file)
+    return orig_song_path, instrumentals_path, main_vocals_dereverb_path, backup_vocals_path
+def convert_to_stereo(audio_path):
+    wave, sr = librosa.load(audio_path, mono=False, sr=44100)
+    # check if mono
+    if type(wave[0]) != np.ndarray:
+        stereo_path = f'{os.path.splitext(audio_path)[0]}_stereo.wav'
+        command = shlex.split(f'ffmpeg -y -loglevel error -i "{audio_path}" -ac 2 -f wav "{stereo_path}"')
+        subprocess.run(command)
+        return stereo_path
+    else:
+        return audio_path
+def pitch_shift(audio_path, pitch_change):
+    output_path = f'{os.path.splitext(audio_path)[0]}_p{pitch_change}.wav'
+    if not os.path.exists(output_path):
+        y, sr = sf.read(audio_path)
+        tfm = sox.Transformer()
+        tfm.pitch(pitch_change)
+        y_shifted = tfm.build_array(input_array=y, sample_rate_in=sr)
+        sf.write(output_path, y_shifted, sr)
+    return output_path
+def get_hash(filepath):
+    with open(filepath, 'rb') as f:
+        file_hash = hashlib.blake2b()
+        while chunk := f.read(8192):
+            file_hash.update(chunk)
+    return file_hash.hexdigest()[:11]
+def display_progress(message, percent, is_webui, progress=None):
+    if is_webui:
+        progress(percent, desc=message)
+    else:
+        print(message)
+def preprocess_song(song_input, mdx_model_params, song_id, is_webui, input_type, progress=None):
+    keep_orig = False
+    if input_type == 'yt':
+        display_progress('[~] Downloading song...', 0, is_webui, progress)
+        song_link = song_input.split('&')[0]
+        orig_song_path = yt_download(song_link)
+    elif input_type == 'local':
+        orig_song_path = song_input
+        keep_orig = True
+    else:
+        orig_song_path = None
+    song_output_dir = os.path.join(output_dir, song_id)
+    orig_song_path = convert_to_stereo(orig_song_path)
+    display_progress('[~] Separating Vocals from Instrumental...', 0.1, is_webui, progress)
+    vocals_path, instrumentals_path = run_roformer(mdx_model_params, song_output_dir, 'model_bs_roformer_ep_317_sdr_12.9755.ckpt', orig_song_path, denoise=True, keep_orig=keep_orig)
+    display_progress('[~] Separating Main Vocals from Backup Vocals...', 0.2, is_webui, progress)
+    backup_vocals_path, main_vocals_path = run_mdx(mdx_model_params, song_output_dir, os.path.join(mdxnet_models_dir, 'UVR_MDXNET_KARA_2.onnx'), vocals_path, suffix='Backup', invert_suffix='Main', denoise=True)
+    display_progress('[~] Applying DeReverb to Vocals...', 0.3, is_webui, progress)
+    _, main_vocals_dereverb_path = run_mdx(mdx_model_params, song_output_dir, os.path.join(mdxnet_models_dir, 'Reverb_HQ_By_FoxJoy.onnx'), main_vocals_path, invert_suffix='DeReverb', exclude_main=True, denoise=True)
+    return orig_song_path, vocals_path, instrumentals_path, main_vocals_path, backup_vocals_path, main_vocals_dereverb_path
+def voice_change(voice_model, vocals_path, output_path, pitch_change, f0_method, index_rate, filter_radius, rms_mix_rate, protect, crepe_hop_length, is_webui):
+    rvc_model_path, rvc_index_path = get_rvc_model(voice_model, is_webui)
+    device = 'cuda:0'
+    config = Config(device, True)
+    hubert_model = load_hubert(device, config.is_half, os.path.join(rvc_models_dir, 'hubert_base.pt'))
+    cpt, version, net_g, tgt_sr, vc = get_vc(device, config.is_half, config, rvc_model_path)
+    # convert main vocals
+    rvc_infer(rvc_index_path, index_rate, vocals_path, output_path, pitch_change, f0_method, cpt, version, net_g, filter_radius, tgt_sr, rms_mix_rate, protect, crepe_hop_length, vc, hubert_model)
+    del hubert_model, cpt
+    gc.collect()
+def add_audio_effects(audio_path, reverb_rm_size, reverb_wet, reverb_dry, reverb_damping):
+    output_path = f'{os.path.splitext(audio_path)[0]}_mixed.wav'
+    # Initialize audio effects plugins
+    board = Pedalboard(
+        [
+            HighpassFilter(),
+            Compressor(ratio=4, threshold_db=-15),
+            Reverb(room_size=reverb_rm_size, dry_level=reverb_dry, wet_level=reverb_wet, damping=reverb_damping)
+         ]
+    )
+    with AudioFile(audio_path) as f:
+        with AudioFile(output_path, 'w', f.samplerate, f.num_channels) as o:
+            # Read one second of audio at a time, until the file is empty:
+            while f.tell() < f.frames:
+                chunk = f.read(int(f.samplerate))
+                effected = board(chunk, f.samplerate, reset=False)
+                o.write(effected)
+    return output_path
+def combine_audio(audio_paths, output_path, main_gain, backup_gain, inst_gain, output_format):
+    main_vocal_audio = AudioSegment.from_wav(audio_paths[0]) - 4 + main_gain
+    backup_vocal_audio = AudioSegment.from_wav(audio_paths[1]) - 6 + backup_gain
+    instrumental_audio = AudioSegment.from_wav(audio_paths[2]) - 7 + inst_gain
+    main_vocal_audio.overlay(backup_vocal_audio).overlay(instrumental_audio).export(output_path, format=output_format)
+def song_cover_pipeline(song_input, voice_model, pitch_change, keep_files,
+                        is_webui=0, main_gain=0, backup_gain=0, inst_gain=0, index_rate=0.5, filter_radius=3,
+                        rms_mix_rate=0.25, f0_method='rmvpe', crepe_hop_length=128, protect=0.33, pitch_change_all=0,
+                        reverb_rm_size=0.15, reverb_wet=0.2, reverb_dry=0.8, reverb_damping=0.7, output_format='mp3',
+                        progress=gr.Progress()):
+    try:
+        if not song_input or not voice_model:
+            raise_exception('Ensure that the song input field and voice model field is filled.', is_webui)
+        display_progress('[~] Starting AI Cover Generation Pipeline...', 0, is_webui, progress)
+        with open(os.path.join(mdxnet_models_dir, 'model_data.json')) as infile:
+            mdx_model_params = json.load(infile)
+        # if youtube url
+        if urlparse(song_input).scheme == 'https':
+            input_type = 'yt'
+            song_id = get_youtube_video_id(song_input)
+            if song_id is None:
+                error_msg = 'Invalid YouTube url.'
+                raise_exception(error_msg, is_webui)
+        # local audio file
+        else:
+            input_type = 'local'
+            song_input = song_input.strip('\"')
+            if os.path.exists(song_input):
+                song_id = get_hash(song_input)
+            else:
+                error_msg = f'{song_input} does not exist.'
+                song_id = None
+                raise_exception(error_msg, is_webui)
+        song_dir = os.path.join(output_dir, song_id)
+        if not os.path.exists(song_dir):
+            os.makedirs(song_dir)
+            orig_song_path, vocals_path, instrumentals_path, main_vocals_path, backup_vocals_path, main_vocals_dereverb_path = preprocess_song(song_input, mdx_model_params, song_id, is_webui, input_type, progress)
+        else:
+            vocals_path, main_vocals_path = None, None
+            paths = get_audio_paths(song_dir)
+            # if any of the audio files aren't available or keep intermediate files, rerun preprocess
+            if any(path is None for path in paths) or keep_files:
+                orig_song_path, vocals_path, instrumentals_path, main_vocals_path, backup_vocals_path, main_vocals_dereverb_path = preprocess_song(song_input, mdx_model_params, song_id, is_webui, input_type, progress)
+            else:
+                orig_song_path, instrumentals_path, main_vocals_dereverb_path, backup_vocals_path = paths
+        pitch_change = pitch_change + pitch_change_all
+        ai_vocals_path = os.path.join(song_dir, f'{os.path.splitext(os.path.basename(orig_song_path))[0]}_lead_{voice_model}_p{pitch_change}_i{index_rate}_fr{filter_radius}_rms{rms_mix_rate}_pro{protect}_{f0_method}{"" if f0_method != "mangio-crepe" else f"_{crepe_hop_length}"}.wav')
+        ai_backing_path = os.path.join(song_dir, f'{os.path.splitext(os.path.basename(orig_song_path))[0]}_backing_{voice_model}_p{pitch_change}_i{index_rate}_fr{filter_radius}_rms{rms_mix_rate}_pro{protect}_{f0_method}{"" if f0_method != "mangio-crepe" else f"_{crepe_hop_length}"}.wav')
+        ai_cover_path = os.path.join(song_dir, f'{os.path.splitext(os.path.basename(orig_song_path))[0]} ({voice_model} Ver).{output_format}')
+        ai_cover_backing_path = os.path.join(song_dir, f'{os.path.splitext(os.path.basename(orig_song_path))[0]} ({voice_model} Ver With Backing).{output_format}')
+        if not os.path.exists(ai_vocals_path):
+            display_progress('[~] Converting lead voice using RVC...', 0.5, is_webui, progress)
+            voice_change(voice_model, main_vocals_dereverb_path, ai_vocals_path, pitch_change, f0_method, index_rate, filter_radius, rms_mix_rate, protect, crepe_hop_length, is_webui)
+            display_progress('[~] Converting backing voice using RVC...', 0.65, is_webui, progress)
+            voice_change(voice_model, backup_vocals_path, ai_backing_path, pitch_change, f0_method, index_rate, filter_radius, rms_mix_rate, protect, crepe_hop_length, is_webui)
+        display_progress('[~] Applying audio effects to Vocals...', 0.8, is_webui, progress)
+        ai_vocals_mixed_path = add_audio_effects(ai_vocals_path, reverb_rm_size, reverb_wet, reverb_dry, reverb_damping)
+        ai_backing_mixed_path = add_audio_effects(ai_backing_path, reverb_rm_size, reverb_wet, reverb_dry, reverb_damping)
+        if pitch_change_all != 0:
+            display_progress('[~] Applying overall pitch change', 0.85, is_webui, progress)
+            instrumentals_path = pitch_shift(instrumentals_path, pitch_change_all)
+            backup_vocals_path = pitch_shift(backup_vocals_path, pitch_change_all)
+        display_progress('[~] Combining AI Vocals and Instrumentals...', 0.9, is_webui, progress)
+        combine_audio([ai_vocals_mixed_path, backup_vocals_path, instrumentals_path], ai_cover_path, main_gain, backup_gain, inst_gain, output_format)
+        combine_audio([ai_vocals_mixed_path, ai_backing_mixed_path, instrumentals_path], ai_cover_backing_path, main_gain, backup_gain, inst_gain, output_format)
+        if not keep_files:
+            display_progress('[~] Removing intermediate audio files...', 0.95, is_webui, progress)
+            intermediate_files = [vocals_path, main_vocals_path, ai_vocals_mixed_path, ai_backing_mixed_path]
+            if pitch_change_all != 0:
+                intermediate_files += [instrumentals_path, backup_vocals_path]
+            for file in intermediate_files:
+                if file and os.path.exists(file):
+                    os.remove(file)
+        return ai_cover_path, ai_cover_backing_path
+    except Exception as e:
+        raise_exception(str(e), is_webui)
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Generate a AI cover song in the song_output/id directory.', add_help=True)
+    parser.add_argument('-i', '--song-input', type=str, required=True, help='Link to a YouTube video or the filepath to a local mp3/wav file to create an AI cover of')
+    parser.add_argument('-dir', '--rvc-dirname', type=str, required=True, help='Name of the folder in the rvc_models directory containing the RVC model file and optional index file to use')
+    parser.add_argument('-p', '--pitch-change', type=int, required=True, help='Change the pitch of AI Vocals only. Generally, use 1 for male to female and -1 for vice-versa. (Octaves)')
+    parser.add_argument('-k', '--keep-files', action=argparse.BooleanOptionalAction, help='Whether to keep all intermediate audio files generated in the song_output/id directory, e.g. Isolated Vocals/Instrumentals')
+    parser.add_argument('-ir', '--index-rate', type=float, default=0.5, help='A decimal number e.g. 0.5, used to reduce/resolve the timbre leakage problem. If set to 1, more biased towards the timbre quality of the training dataset')
+    parser.add_argument('-fr', '--filter-radius', type=int, default=3, help='A number between 0 and 7. If >=3: apply median filtering to the harvested pitch results. The value represents the filter radius and can reduce breathiness.')
+    parser.add_argument('-rms', '--rms-mix-rate', type=float, default=0.25, help="A decimal number e.g. 0.25. Control how much to use the original vocal's loudness (0) or a fixed loudness (1).")
+    parser.add_argument('-palgo', '--pitch-detection-algo', type=str, default='rmvpe', help='Best option is rmvpe (clarity in vocals), then mangio-crepe (smoother vocals).')
+    parser.add_argument('-hop', '--crepe-hop-length', type=int, default=128, help='If pitch detection algo is mangio-crepe, controls how often it checks for pitch changes in milliseconds. The higher the value, the faster the conversion and less risk of voice cracks, but there is less pitch accuracy. Recommended: 128.')
+    parser.add_argument('-pro', '--protect', type=float, default=0.33, help='A decimal number e.g. 0.33. Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy.')
+    parser.add_argument('-mv', '--main-vol', type=int, default=0, help='Volume change for AI main vocals in decibels. Use -3 to decrease by 3 decibels and 3 to increase by 3 decibels')
+    parser.add_argument('-bv', '--backup-vol', type=int, default=0, help='Volume change for backup vocals in decibels')
+    parser.add_argument('-iv', '--inst-vol', type=int, default=0, help='Volume change for instrumentals in decibels')
+    parser.add_argument('-pall', '--pitch-change-all', type=int, default=0, help='Change the pitch/key of vocals and instrumentals. Changing this slightly reduces sound quality')
+    parser.add_argument('-rsize', '--reverb-size', type=float, default=0.15, help='Reverb room size between 0 and 1')
+    parser.add_argument('-rwet', '--reverb-wetness', type=float, default=0.2, help='Reverb wet level between 0 and 1')
+    parser.add_argument('-rdry', '--reverb-dryness', type=float, default=0.8, help='Reverb dry level between 0 and 1')
+    parser.add_argument('-rdamp', '--reverb-damping', type=float, default=0.7, help='Reverb damping between 0 and 1')
+    parser.add_argument('-oformat', '--output-format', type=str, default='mp3', help='Output format of audio file. mp3 for smaller file size, wav for best quality')
+    args = parser.parse_args()
+    rvc_dirname = args.rvc_dirname
+    if not os.path.exists(os.path.join(rvc_models_dir, rvc_dirname)):
+        raise Exception(f'The folder {os.path.join(rvc_models_dir, rvc_dirname)} does not exist.')
+    cover_path = song_cover_pipeline(args.song_input, rvc_dirname, args.pitch_change, args.keep_files,
+                                     main_gain=args.main_vol, backup_gain=args.backup_vol, inst_gain=args.inst_vol,
+                                     index_rate=args.index_rate, filter_radius=args.filter_radius,
+                                     rms_mix_rate=args.rms_mix_rate, f0_method=args.pitch_detection_algo,
+                                     crepe_hop_length=args.crepe_hop_length, protect=args.protect,
+                                     pitch_change_all=args.pitch_change_all,
+                                     reverb_rm_size=args.reverb_size, reverb_wet=args.reverb_wetness,
+                                     reverb_dry=args.reverb_dryness, reverb_damping=args.reverb_damping,
+                                     output_format=args.output_format)
+    print(f'[+] Cover generated at {cover_path}')

src/mdx.py ADDED Viewed

	@@ -0,0 +1,327 @@

+import gc
+import hashlib
+import os
+import queue
+import threading
+import warnings
+import librosa
+import numpy as np
+import onnxruntime as ort
+import soundfile as sf
+import torch
+from tqdm import tqdm
+import re
+import random
+# Suppress every warning category for the whole process once this module is imported.
+warnings.filterwarnings("ignore")
+# Maps a stem name to the name used for its complementary ("inverted") stem;
+# run_mdx looks the primary stem up here to name the inverted output file.
+stem_naming = {'Vocals': 'Instrumental', 'Other': 'Instruments', 'Instrumental': 'Vocals', 'Drums': 'Drumless', 'Bass': 'Bassless'}
+class MDXModel:
+    def __init__(self, device, dim_f, dim_t, n_fft, hop=1024, stem_name=None, compensation=1.000):
+        self.dim_f = dim_f
+        self.dim_t = dim_t
+        self.dim_c = 4
+        self.n_fft = n_fft
+        self.hop = hop
+        self.stem_name = stem_name
+        self.compensation = compensation
+        self.n_bins = self.n_fft // 2 + 1
+        self.chunk_size = hop * (self.dim_t - 1)
+        self.window = torch.hann_window(window_length=self.n_fft, periodic=True).to(device)
+        out_c = self.dim_c
+        self.freq_pad = torch.zeros([1, out_c, self.n_bins - self.dim_f, self.dim_t]).to(device)
+    def stft(self, x):
+        x = x.reshape([-1, self.chunk_size])
+        x = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True, return_complex=True)
+        x = torch.view_as_real(x)
+        x = x.permute([0, 3, 1, 2])
+        x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape([-1, 4, self.n_bins, self.dim_t])
+        return x[:, :, :self.dim_f]
+    def istft(self, x, freq_pad=None):
+        freq_pad = self.freq_pad.repeat([x.shape[0], 1, 1, 1]) if freq_pad is None else freq_pad
+        x = torch.cat([x, freq_pad], -2)
+        # c = 4*2 if self.target_name=='*' else 2
+        x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape([-1, 2, self.n_bins, self.dim_t])
+        x = x.permute([0, 2, 3, 1])
+        x = x.contiguous()
+        x = torch.view_as_complex(x)
+        x = torch.istft(x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True)
+        return x.reshape([-1, 2, self.chunk_size])
+class MDX:
+    DEFAULT_SR = 44100
+    # Unit: seconds
+    DEFAULT_CHUNK_SIZE = 0 * DEFAULT_SR
+    DEFAULT_MARGIN_SIZE = 1 * DEFAULT_SR
+    DEFAULT_PROCESSOR = 0
+    def __init__(self, model_path: str, params: MDXModel, processor=DEFAULT_PROCESSOR):
+        # Set the device and the provider (CPU or CUDA)
+        self.device = torch.device(f'cuda:{processor}') if processor >= 0 else torch.device('cpu')
+        self.provider = ['CUDAExecutionProvider'] if processor >= 0 else ['CPUExecutionProvider']
+        self.model = params
+        # Load the ONNX model using ONNX Runtime
+        self.ort = ort.InferenceSession(model_path, providers=self.provider)
+        # Preload the model for faster performance
+        self.ort.run(None, {'input': torch.rand(1, 4, params.dim_f, params.dim_t).numpy()})
+        self.process = lambda spec: self.ort.run(None, {'input': spec.cpu().numpy()})[0]
+        self.prog = None
+    @staticmethod
+    def get_hash(model_path):
+        try:
+            with open(model_path, 'rb') as f:
+                f.seek(- 10000 * 1024, 2)
+                model_hash = hashlib.md5(f.read()).hexdigest()
+        except:
+            model_hash = hashlib.md5(open(model_path, 'rb').read()).hexdigest()
+        return model_hash
+    @staticmethod
+    def segment(wave, combine=True, chunk_size=DEFAULT_CHUNK_SIZE, margin_size=DEFAULT_MARGIN_SIZE):
+        """
+        Segment or join segmented wave array
+        Args:
+            wave: (np.array) Wave array to be segmented or joined
+            combine: (bool) If True, combines segmented wave array. If False, segments wave array.
+            chunk_size: (int) Size of each segment (in samples)
+            margin_size: (int) Size of margin between segments (in samples)
+        Returns:
+            numpy array: Segmented or joined wave array
+        """
+        if combine:
+            processed_wave = None  # Initializing as None instead of [] for later numpy array concatenation
+            for segment_count, segment in enumerate(wave):
+                start = 0 if segment_count == 0 else margin_size
+                end = None if segment_count == len(wave) - 1 else -margin_size
+                if margin_size == 0:
+                    end = None
+                if processed_wave is None:  # Create array for first segment
+                    processed_wave = segment[:, start:end]
+                else:  # Concatenate to existing array for subsequent segments
+                    processed_wave = np.concatenate((processed_wave, segment[:, start:end]), axis=-1)
+        else:
+            processed_wave = []
+            sample_count = wave.shape[-1]
+            if chunk_size <= 0 or chunk_size > sample_count:
+                chunk_size = sample_count
+            if margin_size > chunk_size:
+                margin_size = chunk_size
+            for segment_count, skip in enumerate(range(0, sample_count, chunk_size)):
+                margin = 0 if segment_count == 0 else margin_size
+                end = min(skip + chunk_size + margin_size, sample_count)
+                start = skip - margin
+                cut = wave[:, start:end].copy()
+                processed_wave.append(cut)
+                if end == sample_count:
+                    break
+        return processed_wave
+    def pad_wave(self, wave):
+        """
+        Pad the wave array to match the required chunk size
+        Args:
+            wave: (np.array) Wave array to be padded
+        Returns:
+            tuple: (padded_wave, pad, trim)
+                - padded_wave: Padded wave array
+                - pad: Number of samples that were padded
+                - trim: Number of samples that were trimmed
+        """
+        n_sample = wave.shape[1]
+        trim = self.model.n_fft // 2
+        gen_size = self.model.chunk_size - 2 * trim
+        pad = gen_size - n_sample % gen_size
+        # Padded wave
+        wave_p = np.concatenate((np.zeros((2, trim)), wave, np.zeros((2, pad)), np.zeros((2, trim))), 1)
+        mix_waves = []
+        for i in range(0, n_sample + pad, gen_size):
+            waves = np.array(wave_p[:, i:i + self.model.chunk_size])
+            mix_waves.append(waves)
+        mix_waves = torch.tensor(mix_waves, dtype=torch.float32).to(self.device)
+        return mix_waves, pad, trim
+    def _process_wave(self, mix_waves, trim, pad, q: queue.Queue, _id: int):
+        """
+        Process each wave segment in a multi-threaded environment
+        Args:
+            mix_waves: (torch.Tensor) Wave segments to be processed
+            trim: (int) Number of samples trimmed during padding
+            pad: (int) Number of samples padded during padding
+            q: (queue.Queue) Queue to hold the processed wave segments
+            _id: (int) Identifier of the processed wave segment
+        Returns:
+            numpy array: Processed wave segment
+        """
+        mix_waves = mix_waves.split(1)
+        with torch.no_grad():
+            pw = []
+            for mix_wave in mix_waves:
+                self.prog.update()
+                spec = self.model.stft(mix_wave)
+                processed_spec = torch.tensor(self.process(spec))
+                processed_wav = self.model.istft(processed_spec.to(self.device))
+                processed_wav = processed_wav[:, :, trim:-trim].transpose(0, 1).reshape(2, -1).cpu().numpy()
+                pw.append(processed_wav)
+        processed_signal = np.concatenate(pw, axis=-1)[:, :-pad]
+        q.put({_id: processed_signal})
+        return processed_signal
+    def process_wave(self, wave: np.array, mt_threads=1):
+        """
+        Process the wave array in a multi-threaded environment
+        Args:
+            wave: (np.array) Wave array to be processed
+            mt_threads: (int) Number of threads to be used for processing
+        Returns:
+            numpy array: Processed wave array
+        """
+        self.prog = tqdm(total=0)
+        chunk = wave.shape[-1] // mt_threads
+        waves = self.segment(wave, False, chunk)
+        # Create a queue to hold the processed wave segments
+        q = queue.Queue()
+        threads = []
+        for c, batch in enumerate(waves):
+            mix_waves, pad, trim = self.pad_wave(batch)
+            self.prog.total = len(mix_waves) * mt_threads
+            thread = threading.Thread(target=self._process_wave, args=(mix_waves, trim, pad, q, c))
+            thread.start()
+            threads.append(thread)
+        for thread in threads:
+            thread.join()
+        self.prog.close()
+        processed_batches = []
+        while not q.empty():
+            processed_batches.append(q.get())
+        processed_batches = [list(wave.values())[0] for wave in
+                             sorted(processed_batches, key=lambda d: list(d.keys())[0])]
+        assert len(processed_batches) == len(waves), 'Incomplete processed batches, please reduce batch size!'
+        return self.segment(processed_batches, True, chunk)
+def run_mdx(model_params, output_dir, model_path, filename, exclude_main=False, exclude_inversion=False, suffix=None, invert_suffix=None, denoise=False, keep_orig=True, m_threads=2):
+    device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
+    device_properties = torch.cuda.get_device_properties(device)
+    vram_gb = device_properties.total_memory / 1024**3
+    m_threads = 1 if vram_gb < 8 else 2
+    model_hash = MDX.get_hash(model_path)
+    mp = model_params.get(model_hash)
+    model = MDXModel(
+        device,
+        dim_f=mp["mdx_dim_f_set"],
+        dim_t=2 ** mp["mdx_dim_t_set"],
+        n_fft=mp["mdx_n_fft_scale_set"],
+        stem_name=mp["primary_stem"],
+        compensation=mp["compensate"]
+    )
+    mdx_sess = MDX(model_path, model)
+    wave, sr = librosa.load(filename, mono=False, sr=44100)
+    # normalizing input wave gives better output
+    peak = max(np.max(wave), abs(np.min(wave)))
+    wave /= peak
+    if denoise:
+        wave_processed = -(mdx_sess.process_wave(-wave, m_threads)) + (mdx_sess.process_wave(wave, m_threads))
+        wave_processed *= 0.5
+    else:
+        wave_processed = mdx_sess.process_wave(wave, m_threads)
+    # return to previous peak
+    wave_processed *= peak
+    stem_name = model.stem_name if suffix is None else suffix
+    main_filepath = None
+    if not exclude_main:
+        main_filepath = os.path.join(output_dir, f"{os.path.basename(os.path.splitext(filename)[0])}_{stem_name}.wav")
+        sf.write(main_filepath, wave_processed.T, sr)
+    invert_filepath = None
+    if not exclude_inversion:
+        diff_stem_name = stem_naming.get(stem_name) if invert_suffix is None else invert_suffix
+        stem_name = f"{stem_name}_diff" if diff_stem_name is None else diff_stem_name
+        invert_filepath = os.path.join(output_dir, f"{os.path.basename(os.path.splitext(filename)[0])}_{stem_name}.wav")
+        sf.write(invert_filepath, (-wave_processed.T * model.compensation) + wave.T, sr)
+    if not keep_orig:
+        os.remove(filename)
+    del mdx_sess, wave_processed, wave
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    gc.collect()
+    return main_filepath, invert_filepath
+def run_roformer(model_params, output_dir, model_name, filename, exclude_main=False, exclude_inversion=False, suffix=None, invert_suffix=None, denoise=False, keep_orig=True, m_threads=2):
+    os.makedirs(output_dir, exist_ok=True)
+    # Load and process the audio
+    wave, sr = librosa.load(filename, mono=False, sr=44100)
+    base_name = os.path.splitext(os.path.basename(filename))[0]
+    roformer_output_format = 'wav'
+    roformer_overlap = 4
+    roformer_segment_size = 256
+    print(f"output_dir: {output_dir}")
+    prompt = f'audio-separator "{filename}" --model_filename {model_name} --output_dir="{output_dir}" --output_format={roformer_output_format} --normalization=0.9 --mdxc_overlap={roformer_overlap} --mdxc_segment_size={roformer_segment_size}'
+    os.system(prompt)
+    vocals_file = f"{base_name}_Vocals.wav"
+    instrumental_file = f"{base_name}_Instrumental.wav"
+    main_filepath = None
+    invert_filepath = None
+    if not exclude_main:
+        main_filepath = os.path.join(output_dir, vocals_file)
+        if os.path.exists(os.path.join(output_dir, f"{base_name}_(Vocals)_{model_name.replace('.9755.ckpt', '')}.wav")):
+            os.rename(os.path.join(output_dir, f"{base_name}_(Vocals)_{model_name.replace('.9755.ckpt', '')}.wav"), main_filepath)
+    if not exclude_inversion:
+        invert_filepath = os.path.join(output_dir, instrumental_file)
+        if os.path.exists(os.path.join(output_dir, f"{base_name}_(Instrumental)_{model_name.replace('.9755.ckpt', '')}.wav")):
+            os.rename(os.path.join(output_dir, f"{base_name}_(Instrumental)_{model_name.replace('.9755.ckpt', '')}.wav"), invert_filepath)
+    if not keep_orig:
+        os.remove(filename)
+    return main_filepath, invert_filepath

src/my_utils.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import ffmpeg
+import numpy as np
+def load_audio(file, sr):
+    try:
+        # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
+        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
+        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
+        file = (
+            file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
+        )  # 防止小白拷路径头尾带了空格和"和回车
+        out, _ = (
+            ffmpeg.input(file, threads=0)
+            .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
+            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
+        )
+    except Exception as e:
+        raise RuntimeError(f"Failed to load audio: {e}")
+    return np.frombuffer(out, np.float32).flatten()

src/rmvpe.py ADDED Viewed

	@@ -0,0 +1,409 @@

+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from librosa.filters import mel
+class BiGRU(nn.Module):
+    def __init__(self, input_features, hidden_features, num_layers):
+        super(BiGRU, self).__init__()
+        self.gru = nn.GRU(
+            input_features,
+            hidden_features,
+            num_layers=num_layers,
+            batch_first=True,
+            bidirectional=True,
+        )
+    def forward(self, x):
+        return self.gru(x)[0]
+class ConvBlockRes(nn.Module):
+    def __init__(self, in_channels, out_channels, momentum=0.01):
+        super(ConvBlockRes, self).__init__()
+        self.conv = nn.Sequential(
+            nn.Conv2d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=(3, 3),
+                stride=(1, 1),
+                padding=(1, 1),
+                bias=False,
+            ),
+            nn.BatchNorm2d(out_channels, momentum=momentum),
+            nn.ReLU(),
+            nn.Conv2d(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                kernel_size=(3, 3),
+                stride=(1, 1),
+                padding=(1, 1),
+                bias=False,
+            ),
+            nn.BatchNorm2d(out_channels, momentum=momentum),
+            nn.ReLU(),
+        )
+        if in_channels != out_channels:
+            self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))
+            self.is_shortcut = True
+        else:
+            self.is_shortcut = False
+    def forward(self, x):
+        if self.is_shortcut:
+            return self.conv(x) + self.shortcut(x)
+        else:
+            return self.conv(x) + x
+class Encoder(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        in_size,
+        n_encoders,
+        kernel_size,
+        n_blocks,
+        out_channels=16,
+        momentum=0.01,
+    ):
+        super(Encoder, self).__init__()
+        self.n_encoders = n_encoders
+        self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
+        self.layers = nn.ModuleList()
+        self.latent_channels = []
+        for i in range(self.n_encoders):
+            self.layers.append(
+                ResEncoderBlock(
+                    in_channels, out_channels, kernel_size, n_blocks, momentum=momentum
+                )
+            )
+            self.latent_channels.append([out_channels, in_size])
+            in_channels = out_channels
+            out_channels *= 2
+            in_size //= 2
+        self.out_size = in_size
+        self.out_channel = out_channels
+    def forward(self, x):
+        concat_tensors = []
+        x = self.bn(x)
+        for i in range(self.n_encoders):
+            _, x = self.layers[i](x)
+            concat_tensors.append(_)
+        return x, concat_tensors
+class ResEncoderBlock(nn.Module):
+    def __init__(
+        self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01
+    ):
+        super(ResEncoderBlock, self).__init__()
+        self.n_blocks = n_blocks
+        self.conv = nn.ModuleList()
+        self.conv.append(ConvBlockRes(in_channels, out_channels, momentum))
+        for i in range(n_blocks - 1):
+            self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
+        self.kernel_size = kernel_size
+        if self.kernel_size is not None:
+            self.pool = nn.AvgPool2d(kernel_size=kernel_size)
+    def forward(self, x):
+        for i in range(self.n_blocks):
+            x = self.conv[i](x)
+        if self.kernel_size is not None:
+            return x, self.pool(x)
+        else:
+            return x
+class Intermediate(nn.Module):  #
+    def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
+        super(Intermediate, self).__init__()
+        self.n_inters = n_inters
+        self.layers = nn.ModuleList()
+        self.layers.append(
+            ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum)
+        )
+        for i in range(self.n_inters - 1):
+            self.layers.append(
+                ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum)
+            )
+    def forward(self, x):
+        for i in range(self.n_inters):
+            x = self.layers[i](x)
+        return x
+class ResDecoderBlock(nn.Module):
+    def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
+        super(ResDecoderBlock, self).__init__()
+        out_padding = (0, 1) if stride == (1, 2) else (1, 1)
+        self.n_blocks = n_blocks
+        self.conv1 = nn.Sequential(
+            nn.ConvTranspose2d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=(3, 3),
+                stride=stride,
+                padding=(1, 1),
+                output_padding=out_padding,
+                bias=False,
+            ),
+            nn.BatchNorm2d(out_channels, momentum=momentum),
+            nn.ReLU(),
+        )
+        self.conv2 = nn.ModuleList()
+        self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
+        for i in range(n_blocks - 1):
+            self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))
+    def forward(self, x, concat_tensor):
+        x = self.conv1(x)
+        x = torch.cat((x, concat_tensor), dim=1)
+        for i in range(self.n_blocks):
+            x = self.conv2[i](x)
+        return x
+class Decoder(nn.Module):
+    def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
+        super(Decoder, self).__init__()
+        self.layers = nn.ModuleList()
+        self.n_decoders = n_decoders
+        for i in range(self.n_decoders):
+            out_channels = in_channels // 2
+            self.layers.append(
+                ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum)
+            )
+            in_channels = out_channels
+    def forward(self, x, concat_tensors):
+        for i in range(self.n_decoders):
+            x = self.layers[i](x, concat_tensors[-1 - i])
+        return x
+class DeepUnet(nn.Module):
+    def __init__(
+        self,
+        kernel_size,
+        n_blocks,
+        en_de_layers=5,
+        inter_layers=4,
+        in_channels=1,
+        en_out_channels=16,
+    ):
+        super(DeepUnet, self).__init__()
+        self.encoder = Encoder(
+            in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels
+        )
+        self.intermediate = Intermediate(
+            self.encoder.out_channel // 2,
+            self.encoder.out_channel,
+            inter_layers,
+            n_blocks,
+        )
+        self.decoder = Decoder(
+            self.encoder.out_channel, en_de_layers, kernel_size, n_blocks
+        )
+    def forward(self, x):
+        x, concat_tensors = self.encoder(x)
+        x = self.intermediate(x)
+        x = self.decoder(x, concat_tensors)
+        return x
+class E2E(nn.Module):
+    def __init__(
+        self,
+        n_blocks,
+        n_gru,
+        kernel_size,
+        en_de_layers=5,
+        inter_layers=4,
+        in_channels=1,
+        en_out_channels=16,
+    ):
+        super(E2E, self).__init__()
+        self.unet = DeepUnet(
+            kernel_size,
+            n_blocks,
+            en_de_layers,
+            inter_layers,
+            in_channels,
+            en_out_channels,
+        )
+        self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
+        if n_gru:
+            self.fc = nn.Sequential(
+                BiGRU(3 * 128, 256, n_gru),
+                nn.Linear(512, 360),
+                nn.Dropout(0.25),
+                nn.Sigmoid(),
+            )
+        else:
+            self.fc = nn.Sequential(
+                nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid()
+            )
+    def forward(self, mel):
+        mel = mel.transpose(-1, -2).unsqueeze(1)
+        x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
+        x = self.fc(x)
+        return x
+class MelSpectrogram(torch.nn.Module):
+    def __init__(
+        self,
+        is_half,
+        n_mel_channels,
+        sampling_rate,
+        win_length,
+        hop_length,
+        n_fft=None,
+        mel_fmin=0,
+        mel_fmax=None,
+        clamp=1e-5,
+    ):
+        super().__init__()
+        n_fft = win_length if n_fft is None else n_fft
+        self.hann_window = {}
+        mel_basis = mel(
+            sr=sampling_rate,
+            n_fft=n_fft,
+            n_mels=n_mel_channels,
+            fmin=mel_fmin,
+            fmax=mel_fmax,
+            htk=True,
+        )
+        mel_basis = torch.from_numpy(mel_basis).float()
+        self.register_buffer("mel_basis", mel_basis)
+        self.n_fft = win_length if n_fft is None else n_fft
+        self.hop_length = hop_length
+        self.win_length = win_length
+        self.sampling_rate = sampling_rate
+        self.n_mel_channels = n_mel_channels
+        self.clamp = clamp
+        self.is_half = is_half
+    def forward(self, audio, keyshift=0, speed=1, center=True):
+        factor = 2 ** (keyshift / 12)
+        n_fft_new = int(np.round(self.n_fft * factor))
+        win_length_new = int(np.round(self.win_length * factor))
+        hop_length_new = int(np.round(self.hop_length * speed))
+        keyshift_key = str(keyshift) + "_" + str(audio.device)
+        if keyshift_key not in self.hann_window:
+            self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(
+                audio.device
+            )
+        fft = torch.stft(
+            audio,
+            n_fft=n_fft_new,
+            hop_length=hop_length_new,
+            win_length=win_length_new,
+            window=self.hann_window[keyshift_key],
+            center=center,
+            return_complex=True,
+        )
+        magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))
+        if keyshift != 0:
+            size = self.n_fft // 2 + 1
+            resize = magnitude.size(1)
+            if resize < size:
+                magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
+            magnitude = magnitude[:, :size, :] * self.win_length / win_length_new
+        mel_output = torch.matmul(self.mel_basis, magnitude)
+        if self.is_half == True:
+            mel_output = mel_output.half()
+        log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
+        return log_mel_spec
+class RMVPE:
+    def __init__(self, model_path, is_half, device=None):
+        self.resample_kernel = {}
+        model = E2E(4, 1, (2, 2))
+        ckpt = torch.load(model_path, map_location="cpu")
+        model.load_state_dict(ckpt)
+        model.eval()
+        if is_half == True:
+            model = model.half()
+        self.model = model
+        self.resample_kernel = {}
+        self.is_half = is_half
+        if device is None:
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.device = device
+        self.mel_extractor = MelSpectrogram(
+            is_half, 128, 16000, 1024, 160, None, 30, 8000
+        ).to(device)
+        self.model = self.model.to(device)
+        cents_mapping = 20 * np.arange(360) + 1997.3794084376191
+        self.cents_mapping = np.pad(cents_mapping, (4, 4))  # 368
+    def mel2hidden(self, mel):
+        with torch.no_grad():
+            n_frames = mel.shape[-1]
+            mel = F.pad(
+                mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect"
+            )
+            hidden = self.model(mel)
+            return hidden[:, :n_frames]
+    def decode(self, hidden, thred=0.03):
+        cents_pred = self.to_local_average_cents(hidden, thred=thred)
+        f0 = 10 * (2 ** (cents_pred / 1200))
+        f0[f0 == 10] = 0
+        # f0 = np.array([10 * (2 ** (cent_pred / 1200)) if cent_pred else 0 for cent_pred in cents_pred])
+        return f0
+    def infer_from_audio(self, audio, thred=0.03):
+        audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0)
+        # torch.cuda.synchronize()
+        # t0=ttime()
+        mel = self.mel_extractor(audio, center=True)
+        # torch.cuda.synchronize()
+        # t1=ttime()
+        hidden = self.mel2hidden(mel)
+        # torch.cuda.synchronize()
+        # t2=ttime()
+        hidden = hidden.squeeze(0).cpu().numpy()
+        if self.is_half == True:
+            hidden = hidden.astype("float32")
+        f0 = self.decode(hidden, thred=thred)
+        # torch.cuda.synchronize()
+        # t3=ttime()
+        # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0))
+        return f0
+    def to_local_average_cents(self, salience, thred=0.05):
+        # t0 = ttime()
+        center = np.argmax(salience, axis=1)  # 帧长#index
+        salience = np.pad(salience, ((0, 0), (4, 4)))  # 帧长,368
+        # t1 = ttime()
+        center += 4
+        todo_salience = []
+        todo_cents_mapping = []
+        starts = center - 4
+        ends = center + 5
+        for idx in range(salience.shape[0]):
+            todo_salience.append(salience[:, starts[idx] : ends[idx]][idx])
+            todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]])
+        # t2 = ttime()
+        todo_salience = np.array(todo_salience)  # 帧长，9
+        todo_cents_mapping = np.array(todo_cents_mapping)  # 帧长，9
+        product_sum = np.sum(todo_salience * todo_cents_mapping, 1)
+        weight_sum = np.sum(todo_salience, 1)  # 帧长
+        devided = product_sum / weight_sum  # 帧长
+        # t3 = ttime()
+        maxx = np.max(salience, axis=1)  # 帧长
+        devided[maxx <= thred] = 0
+        # t4 = ttime()
+        # print("decode:%s\t%s\t%s\t%s" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
+        return devided

src/rvc.py ADDED Viewed

	@@ -0,0 +1,151 @@

+from multiprocessing import cpu_count
+from pathlib import Path
+import torch
+from fairseq import checkpoint_utils
+from scipy.io import wavfile
+from src.infer_pack.models import (
+    SynthesizerTrnMs256NSFsid,
+    SynthesizerTrnMs256NSFsid_nono,
+    SynthesizerTrnMs768NSFsid,
+    SynthesizerTrnMs768NSFsid_nono,
+)
+from src.my_utils import load_audio
+from src.vc_infer_pipeline import VC
+# Directory two levels above this file (the parent of src/); used to locate configs and scripts.
+BASE_DIR = Path(__file__).resolve().parent.parent
+class Config:
+    def __init__(self, device, is_half):
+        self.device = device
+        self.is_half = is_half
+        self.n_cpu = 0
+        self.gpu_name = None
+        self.gpu_mem = None
+        self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
+    def device_config(self) -> tuple:
+        if torch.cuda.is_available():
+            i_device = int(self.device.split(":")[-1])
+            self.gpu_name = torch.cuda.get_device_name(i_device)
+            if (
+                    ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
+                    or "P40" in self.gpu_name.upper()
+                    or "1060" in self.gpu_name
+                    or "1070" in self.gpu_name
+                    or "1080" in self.gpu_name
+            ):
+                print("16 series/10 series P40 forced single precision")
+                self.is_half = False
+                for config_file in ["32k.json", "40k.json", "48k.json"]:
+                    with open(BASE_DIR / "src" / "configs" / config_file, "r") as f:
+                        strr = f.read().replace("true", "false")
+                    with open(BASE_DIR / "src" / "configs" / config_file, "w") as f:
+                        f.write(strr)
+                with open(BASE_DIR / "src" / "trainset_preprocess_pipeline_print.py", "r") as f:
+                    strr = f.read().replace("3.7", "3.0")
+                with open(BASE_DIR / "src" / "trainset_preprocess_pipeline_print.py", "w") as f:
+                    f.write(strr)
+            else:
+                self.gpu_name = None
+            self.gpu_mem = int(
+                torch.cuda.get_device_properties(i_device).total_memory
+                / 1024
+                / 1024
+                / 1024
+                + 0.4
+            )
+            if self.gpu_mem <= 4:
+                with open(BASE_DIR / "src" / "trainset_preprocess_pipeline_print.py", "r") as f:
+                    strr = f.read().replace("3.7", "3.0")
+                with open(BASE_DIR / "src" / "trainset_preprocess_pipeline_print.py", "w") as f:
+                    f.write(strr)
+        elif torch.backends.mps.is_available():
+            print("No supported N-card found, use MPS for inference")
+            self.device = "mps"
+        else:
+            print("No supported N-card found, use CPU for inference")
+            self.device = "cpu"
+            self.is_half = True
+        if self.n_cpu == 0:
+            self.n_cpu = cpu_count()
+        if self.is_half:
+            # 6G memory config
+            x_pad = 3
+            x_query = 10
+            x_center = 60
+            x_max = 65
+        else:
+            # 5G memory config
+            x_pad = 1
+            x_query = 6
+            x_center = 38
+            x_max = 41
+        if self.gpu_mem != None and self.gpu_mem <= 4:
+            x_pad = 1
+            x_query = 5
+            x_center = 30
+            x_max = 32
+        return x_pad, x_query, x_center, x_max
+def load_hubert(device, is_half, model_path):
+    models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task([model_path], suffix='', )
+    hubert = models[0]
+    hubert = hubert.to(device)
+    if is_half:
+        hubert = hubert.half()
+    else:
+        hubert = hubert.float()
+    hubert.eval()
+    return hubert
+def get_vc(device, is_half, config, model_path):
+    cpt = torch.load(model_path, map_location='cpu')
+    if "config" not in cpt or "weight" not in cpt:
+        raise ValueError(f'Incorrect format for {model_path}. Use a voice model trained using RVC v2 instead.')
+    tgt_sr = cpt["config"][-1]
+    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
+    if_f0 = cpt.get("f0", 1)
+    version = cpt.get("version", "v1")
+    if version == "v1":
+        if if_f0 == 1:
+            net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half)
+        else:
+            net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+    elif version == "v2":
+        if if_f0 == 1:
+            net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=is_half)
+        else:
+            net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
+    del net_g.enc_q
+    print(net_g.load_state_dict(cpt["weight"], strict=False))
+    net_g.eval().to(device)
+    if is_half:
+        net_g = net_g.half()
+    else:
+        net_g = net_g.float()
+    vc = VC(tgt_sr, config)
+    return cpt, version, net_g, tgt_sr, vc
+def rvc_infer(index_path, index_rate, input_path, output_path, pitch_change, f0_method, cpt, version, net_g, filter_radius, tgt_sr, rms_mix_rate, protect, crepe_hop_length, vc, hubert_model):
+    audio = load_audio(input_path, 16000)
+    times = [0, 0, 0]
+    if_f0 = cpt.get('f0', 1)
+    audio_opt = vc.pipeline(hubert_model, net_g, 0, audio, input_path, times, pitch_change, f0_method, index_path, index_rate, if_f0, filter_radius, tgt_sr, 0, rms_mix_rate, version, protect, crepe_hop_length)
+    wavfile.write(output_path, tgt_sr, audio_opt)

src/trainset_preprocess_pipeline_print.py ADDED Viewed

	@@ -0,0 +1,146 @@

+import sys, os, multiprocessing
+from scipy import signal
+# The working directory is put on sys.path before the bare-name local imports below (slicer2, my_utils).
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+# NOTE: the five CLI arguments are read at import time, so importing this module without them raises IndexError.
+inp_root = sys.argv[1]
+sr = int(sys.argv[2])
+n_p = int(sys.argv[3])
+exp_dir = sys.argv[4]
+# Only the literal string "True" selects sequential processing.
+noparallel = sys.argv[5] == "True"
+import numpy as np, os, traceback
+from slicer2 import Slicer
+import librosa, traceback
+from scipy.io import wavfile
+import multiprocessing
+from my_utils import load_audio
+import tqdm
+# Extra arguments forwarded to load_audio by PreProcess.pipeline.
+DoFormant = False
+Quefrency = 1.0
+Timbre = 1.0
+# Lock taken by println around its print and log write.
+mutex = multiprocessing.Lock()
+# Log file handle used by println; opened in append mode and not explicitly closed in this file.
+f = open("%s/preprocess.log" % exp_dir, "a+")
+def println(strr):
+    mutex.acquire()
+    print(strr)
+    f.write("%s\n" % strr)
+    f.flush()
+    mutex.release()
+class PreProcess:
+    def __init__(self, sr, exp_dir):
+        self.slicer = Slicer(
+            sr=sr,
+            threshold=-42,
+            min_length=1500,
+            min_interval=400,
+            hop_size=15,
+            max_sil_kept=500,
+        )
+        self.sr = sr
+        self.bh, self.ah = signal.butter(N=5, Wn=48, btype="high", fs=self.sr)
+        self.per = 3.0
+        self.overlap = 0.3
+        self.tail = self.per + self.overlap
+        self.max = 0.9
+        self.alpha = 0.75
+        self.exp_dir = exp_dir
+        self.gt_wavs_dir = "%s/0_gt_wavs" % exp_dir
+        self.wavs16k_dir = "%s/1_16k_wavs" % exp_dir
+        os.makedirs(self.exp_dir, exist_ok=True)
+        os.makedirs(self.gt_wavs_dir, exist_ok=True)
+        os.makedirs(self.wavs16k_dir, exist_ok=True)
+    def norm_write(self, tmp_audio, idx0, idx1):
+        tmp_max = np.abs(tmp_audio).max()
+        if tmp_max > 2.5:
+            print("%s-%s-%s-filtered" % (idx0, idx1, tmp_max))
+            return
+        tmp_audio = (tmp_audio / tmp_max * (self.max * self.alpha)) + (
+            1 - self.alpha
+        ) * tmp_audio
+        wavfile.write(
+            "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1),
+            self.sr,
+            tmp_audio.astype(np.float32),
+        )
+        tmp_audio = librosa.resample(
+            tmp_audio, orig_sr=self.sr, target_sr=16000
+        )  # , res_type="soxr_vhq"
+        wavfile.write(
+            "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1),
+            16000,
+            tmp_audio.astype(np.float32),
+        )
+    def pipeline(self, path, idx0):
+        try:
+            audio = load_audio(path, self.sr, DoFormant, Quefrency, Timbre)
+            # zero phased digital filter cause pre-ringing noise...
+            # audio = signal.filtfilt(self.bh, self.ah, audio)
+            audio = signal.lfilter(self.bh, self.ah, audio)
+            idx1 = 0
+            for audio in self.slicer.slice(audio):
+                i = 0
+                while 1:
+                    start = int(self.sr * (self.per - self.overlap) * i)
+                    i += 1
+                    if len(audio[start:]) > self.tail * self.sr:
+                        tmp_audio = audio[start : start + int(self.per * self.sr)]
+                        self.norm_write(tmp_audio, idx0, idx1)
+                        idx1 += 1
+                    else:
+                        tmp_audio = audio[start:]
+                        idx1 += 1
+                        break
+                self.norm_write(tmp_audio, idx0, idx1)
+            # println("%s->Suc." % path)
+        except:
+            println("%s->%s" % (path, traceback.format_exc()))
+    def pipeline_mp(self, infos, thread_n):
+        for path, idx0 in tqdm.tqdm(
+            infos, position=thread_n, leave=True, desc="thread:%s" % thread_n
+        ):
+            self.pipeline(path, idx0)
+    def pipeline_mp_inp_dir(self, inp_root, n_p):
+        try:
+            infos = [
+                ("%s/%s" % (inp_root, name), idx)
+                for idx, name in enumerate(sorted(list(os.listdir(inp_root))))
+            ]
+            if noparallel:
+                for i in range(n_p):
+                    self.pipeline_mp(infos[i::n_p])
+            else:
+                ps = []
+                for i in range(n_p):
+                    p = multiprocessing.Process(
+                        target=self.pipeline_mp, args=(infos[i::n_p], i)
+                    )
+                    ps.append(p)
+                    p.start()
+                for i in range(n_p):
+                    ps[i].join()
+        except:
+            println("Fail. %s" % traceback.format_exc())
+def preprocess_trainset(inp_root, sr, n_p, exp_dir):
+    pp = PreProcess(sr, exp_dir)
+    println("start preprocess")
+    println(sys.argv)
+    pp.pipeline_mp_inp_dir(inp_root, n_p)
+    println("end preprocess")
+if __name__ == "__main__":
+    preprocess_trainset(inp_root, sr, n_p, exp_dir)

src/vc_infer_pipeline.py ADDED Viewed

	@@ -0,0 +1,653 @@

+from functools import lru_cache
+from time import time as ttime
+import faiss
+import librosa
+import numpy as np
+import os
+import parselmouth
+import pyworld
+import sys
+import torch
+import torch.nn.functional as F
+import torchcrepe
+import traceback
+from scipy import signal
+from torch import Tensor
+# Directory two levels above this file (the parent of src/).
+BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+now_dir = os.path.join(BASE_DIR, 'src')
+# Makes modules in src/ importable by bare name (get_f0 imports ``rmvpe`` that way).
+sys.path.append(now_dir)
+# 5th-order Butterworth high-pass at 48 Hz for 16 kHz audio; applied in VC.pipeline.
+bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
+# Audio arrays keyed by input path; filled by VC.get_f0 and read by cache_harvest_f0.
+input_audio_path2wav = {}
+@lru_cache
+def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
+    audio = input_audio_path2wav[input_audio_path]
+    f0, t = pyworld.harvest(
+        audio,
+        fs=fs,
+        f0_ceil=f0max,
+        f0_floor=f0min,
+        frame_period=frame_period,
+    )
+    f0 = pyworld.stonemask(audio, f0, t, fs)
+    return f0
+def change_rms(data1, sr1, data2, sr2, rate):  # 1是输入音频，2是输出音频,rate是2的占比
+    # print(data1.max(),data2.max())
+    rms1 = librosa.feature.rms(
+        y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2
+    )  # 每半秒一个点
+    rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2)
+    rms1 = torch.from_numpy(rms1)
+    rms1 = F.interpolate(
+        rms1.unsqueeze(0), size=data2.shape[0], mode="linear"
+    ).squeeze()
+    rms2 = torch.from_numpy(rms2)
+    rms2 = F.interpolate(
+        rms2.unsqueeze(0), size=data2.shape[0], mode="linear"
+    ).squeeze()
+    rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)
+    data2 *= (
+        torch.pow(rms1, torch.tensor(1 - rate))
+        * torch.pow(rms2, torch.tensor(rate - 1))
+    ).numpy()
+    return data2
+class VC(object):
+    def __init__(self, tgt_sr, config):
+        self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
+            config.x_pad,
+            config.x_query,
+            config.x_center,
+            config.x_max,
+            config.is_half,
+        )
+        self.sr = 16000  # hubert输入采样率
+        self.window = 160  # 每帧点数
+        self.t_pad = self.sr * self.x_pad  # 每条前后pad时间
+        self.t_pad_tgt = tgt_sr * self.x_pad
+        self.t_pad2 = self.t_pad * 2
+        self.t_query = self.sr * self.x_query  # 查询切点前后查询时间
+        self.t_center = self.sr * self.x_center  # 查询切点位置
+        self.t_max = self.sr * self.x_max  # 免查询时长阈值
+        self.device = config.device
+    # Fork Feature: Get the best torch device to use for f0 algorithms that require a torch device. Will return the type (torch.device)
+    def get_optimal_torch_device(self, index: int = 0) -> torch.device:
+        # Get cuda device
+        if torch.cuda.is_available():
+            return torch.device(
+                f"cuda:{index % torch.cuda.device_count()}"
+            )  # Very fast
+        elif torch.backends.mps.is_available():
+            return torch.device("mps")
+        # Insert an else here to grab "xla" devices if available. TO DO later. Requires the torch_xla.core.xla_model library
+        # Else wise return the "cpu" as a torch device,
+        return torch.device("cpu")
+    # Fork Feature: Compute f0 with the crepe method
+    def get_f0_crepe_computation(
+        self,
+        x,
+        f0_min,
+        f0_max,
+        p_len,
+        hop_length=160,  # 512 before. Hop length changes the speed that the voice jumps to a different dramatic pitch. Lower hop lengths means more pitch accuracy but longer inference time.
+        model="full",  # Either use crepe-tiny "tiny" or crepe "full". Default is full
+    ):
+        x = x.astype(
+            np.float32
+        )  # fixes the F.conv2D exception. We needed to convert double to float.
+        x /= np.quantile(np.abs(x), 0.999)
+        torch_device = self.get_optimal_torch_device()
+        audio = torch.from_numpy(x).to(torch_device, copy=True)
+        audio = torch.unsqueeze(audio, dim=0)
+        if audio.ndim == 2 and audio.shape[0] > 1:
+            audio = torch.mean(audio, dim=0, keepdim=True).detach()
+        audio = audio.detach()
+        print("Initiating prediction with a crepe_hop_length of: " + str(hop_length))
+        pitch: Tensor = torchcrepe.predict(
+            audio,
+            self.sr,
+            hop_length,
+            f0_min,
+            f0_max,
+            model,
+            batch_size=hop_length * 2,
+            device=torch_device,
+            pad=True,
+        )
+        p_len = p_len or x.shape[0] // hop_length
+        # Resize the pitch for final f0
+        source = np.array(pitch.squeeze(0).cpu().float().numpy())
+        source[source < 0.001] = np.nan
+        target = np.interp(
+            np.arange(0, len(source) * p_len, len(source)) / p_len,
+            np.arange(0, len(source)),
+            source,
+        )
+        f0 = np.nan_to_num(target)
+        return f0  # Resized f0
+    def get_f0_official_crepe_computation(
+        self,
+        x,
+        f0_min,
+        f0_max,
+        model="full",
+    ):
+        # Pick a batch size that doesn't cause memory errors on your gpu
+        batch_size = 512
+        # Compute pitch using first gpu
+        audio = torch.tensor(np.copy(x))[None].float()
+        f0, pd = torchcrepe.predict(
+            audio,
+            self.sr,
+            self.window,
+            f0_min,
+            f0_max,
+            model,
+            batch_size=batch_size,
+            device=self.device,
+            return_periodicity=True,
+        )
+        pd = torchcrepe.filter.median(pd, 3)
+        f0 = torchcrepe.filter.mean(f0, 3)
+        f0[pd < 0.1] = 0
+        f0 = f0[0].cpu().numpy()
+        return f0
+    # Fork Feature: Compute pYIN f0 method
+    def get_f0_pyin_computation(self, x, f0_min, f0_max):
+        y, sr = librosa.load("saudio/Sidney.wav", self.sr, mono=True)
+        f0, _, _ = librosa.pyin(y, sr=self.sr, fmin=f0_min, fmax=f0_max)
+        f0 = f0[1:]  # Get rid of extra first frame
+        return f0
+    # Fork Feature: Acquire median hybrid f0 estimation calculation
+    def get_f0_hybrid_computation(
+        self,
+        methods_str,
+        input_audio_path,
+        x,
+        f0_min,
+        f0_max,
+        p_len,
+        filter_radius,
+        crepe_hop_length,
+        time_step,
+    ):
+        # Get various f0 methods from input to use in the computation stack
+        s = methods_str
+        s = s.split("hybrid")[1]
+        s = s.replace("[", "").replace("]", "")
+        methods = s.split("+")
+        f0_computation_stack = []
+        print("Calculating f0 pitch estimations for methods: %s" % str(methods))
+        x = x.astype(np.float32)
+        x /= np.quantile(np.abs(x), 0.999)
+        # Get f0 calculations for all methods specified
+        for method in methods:
+            f0 = None
+            if method == "pm":
+                f0 = (
+                    parselmouth.Sound(x, self.sr)
+                    .to_pitch_ac(
+                        time_step=time_step / 1000,
+                        voicing_threshold=0.6,
+                        pitch_floor=f0_min,
+                        pitch_ceiling=f0_max,
+                    )
+                    .selected_array["frequency"]
+                )
+                pad_size = (p_len - len(f0) + 1) // 2
+                if pad_size > 0 or p_len - len(f0) - pad_size > 0:
+                    f0 = np.pad(
+                        f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
+                    )
+            elif method == "crepe":
+                f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max)
+                f0 = f0[1:]  # Get rid of extra first frame
+            elif method == "crepe-tiny":
+                f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "tiny")
+                f0 = f0[1:]  # Get rid of extra first frame
+            elif method == "mangio-crepe":
+                f0 = self.get_f0_crepe_computation(
+                    x, f0_min, f0_max, p_len, crepe_hop_length
+                )
+            elif method == "mangio-crepe-tiny":
+                f0 = self.get_f0_crepe_computation(
+                    x, f0_min, f0_max, p_len, crepe_hop_length, "tiny"
+                )
+            elif method == "harvest":
+                f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
+                if filter_radius > 2:
+                    f0 = signal.medfilt(f0, 3)
+                f0 = f0[1:]  # Get rid of first frame.
+            elif method == "dio":  # Potentially buggy?
+                f0, t = pyworld.dio(
+                    x.astype(np.double),
+                    fs=self.sr,
+                    f0_ceil=f0_max,
+                    f0_floor=f0_min,
+                    frame_period=10,
+                )
+                f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
+                f0 = signal.medfilt(f0, 3)
+                f0 = f0[1:]
+            # elif method == "pyin": Not Working just yet
+            #    f0 = self.get_f0_pyin_computation(x, f0_min, f0_max)
+            # Push method to the stack
+            f0_computation_stack.append(f0)
+        for fc in f0_computation_stack:
+            print(len(fc))
+        print("Calculating hybrid median f0 from the stack of: %s" % str(methods))
+        f0_median_hybrid = None
+        if len(f0_computation_stack) == 1:
+            f0_median_hybrid = f0_computation_stack[0]
+        else:
+            f0_median_hybrid = np.nanmedian(f0_computation_stack, axis=0)
+        return f0_median_hybrid
+    def get_f0(
+        self,
+        input_audio_path,
+        x,
+        p_len,
+        f0_up_key,
+        f0_method,
+        filter_radius,
+        crepe_hop_length,
+        inp_f0=None,
+    ):
+        global input_audio_path2wav
+        time_step = self.window / self.sr * 1000
+        f0_min = 50
+        f0_max = 1100
+        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+        if f0_method == "pm":
+            f0 = (
+                parselmouth.Sound(x, self.sr)
+                .to_pitch_ac(
+                    time_step=time_step / 1000,
+                    voicing_threshold=0.6,
+                    pitch_floor=f0_min,
+                    pitch_ceiling=f0_max,
+                )
+                .selected_array["frequency"]
+            )
+            pad_size = (p_len - len(f0) + 1) // 2
+            if pad_size > 0 or p_len - len(f0) - pad_size > 0:
+                f0 = np.pad(
+                    f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
+                )
+        elif f0_method == "harvest":
+            input_audio_path2wav[input_audio_path] = x.astype(np.double)
+            f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
+            if filter_radius > 2:
+                f0 = signal.medfilt(f0, 3)
+        elif f0_method == "dio":  # Potentially Buggy?
+            f0, t = pyworld.dio(
+                x.astype(np.double),
+                fs=self.sr,
+                f0_ceil=f0_max,
+                f0_floor=f0_min,
+                frame_period=10,
+            )
+            f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
+            f0 = signal.medfilt(f0, 3)
+        elif f0_method == "crepe":
+            f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max)
+        elif f0_method == "crepe-tiny":
+            f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "tiny")
+        elif f0_method == "mangio-crepe":
+            f0 = self.get_f0_crepe_computation(
+                x, f0_min, f0_max, p_len, crepe_hop_length
+            )
+        elif f0_method == "mangio-crepe-tiny":
+            f0 = self.get_f0_crepe_computation(
+                x, f0_min, f0_max, p_len, crepe_hop_length, "tiny"
+            )
+        elif f0_method == "rmvpe":
+            if hasattr(self, "model_rmvpe") == False:
+                from rmvpe import RMVPE
+                self.model_rmvpe = RMVPE(
+                    os.path.join(BASE_DIR, 'rvc_models', 'rmvpe.pt'), is_half=self.is_half, device=self.device
+                )
+            f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
+        elif "hybrid" in f0_method:
+            # Perform hybrid median pitch estimation
+            input_audio_path2wav[input_audio_path] = x.astype(np.double)
+            f0 = self.get_f0_hybrid_computation(
+                f0_method,
+                input_audio_path,
+                x,
+                f0_min,
+                f0_max,
+                p_len,
+                filter_radius,
+                crepe_hop_length,
+                time_step,
+            )
+        f0 *= pow(2, f0_up_key / 12)
+        # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
+        tf0 = self.sr // self.window  # 每秒f0点数
+        if inp_f0 is not None:
+            delta_t = np.round(
+                (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
+            ).astype("int16")
+            replace_f0 = np.interp(
+                list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
+            )
+            shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
+            f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
+                :shape
+            ]
+        # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
+        f0bak = f0.copy()
+        f0_mel = 1127 * np.log(1 + f0 / 700)
+        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
+            f0_mel_max - f0_mel_min
+        ) + 1
+        f0_mel[f0_mel <= 1] = 1
+        f0_mel[f0_mel > 255] = 255
+        f0_coarse = np.rint(f0_mel).astype(np.int_)
+        return f0_coarse, f0bak  # 1-0
+    def vc(
+        self,
+        model,
+        net_g,
+        sid,
+        audio0,
+        pitch,
+        pitchf,
+        times,
+        index,
+        big_npy,
+        index_rate,
+        version,
+        protect,
+    ):  # ,file_index,file_big_npy
+        feats = torch.from_numpy(audio0)
+        if self.is_half:
+            feats = feats.half()
+        else:
+            feats = feats.float()
+        if feats.dim() == 2:  # double channels
+            feats = feats.mean(-1)
+        assert feats.dim() == 1, feats.dim()
+        feats = feats.view(1, -1)
+        padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
+        inputs = {
+            "source": feats.to(self.device),
+            "padding_mask": padding_mask,
+            "output_layer": 9 if version == "v1" else 12,
+        }
+        t0 = ttime()
+        with torch.no_grad():
+            logits = model.extract_features(**inputs)
+            feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
+        if protect < 0.5 and pitch != None and pitchf != None:
+            feats0 = feats.clone()
+        if (
+            isinstance(index, type(None)) == False
+            and isinstance(big_npy, type(None)) == False
+            and index_rate != 0
+        ):
+            npy = feats[0].cpu().numpy()
+            if self.is_half:
+                npy = npy.astype("float32")
+            # _, I = index.search(npy, 1)
+            # npy = big_npy[I.squeeze()]
+            score, ix = index.search(npy, k=8)
+            weight = np.square(1 / score)
+            weight /= weight.sum(axis=1, keepdims=True)
+            npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
+            if self.is_half:
+                npy = npy.astype("float16")
+            feats = (
+                torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
+                + (1 - index_rate) * feats
+            )
+        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
+        if protect < 0.5 and pitch != None and pitchf != None:
+            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
+                0, 2, 1
+            )
+        t1 = ttime()
+        p_len = audio0.shape[0] // self.window
+        if feats.shape[1] < p_len:
+            p_len = feats.shape[1]
+            if pitch != None and pitchf != None:
+                pitch = pitch[:, :p_len]
+                pitchf = pitchf[:, :p_len]
+        if protect < 0.5 and pitch != None and pitchf != None:
+            pitchff = pitchf.clone()
+            pitchff[pitchf > 0] = 1
+            pitchff[pitchf < 1] = protect
+            pitchff = pitchff.unsqueeze(-1)
+            feats = feats * pitchff + feats0 * (1 - pitchff)
+            feats = feats.to(feats0.dtype)
+        p_len = torch.tensor([p_len], device=self.device).long()
+        with torch.no_grad():
+            if pitch != None and pitchf != None:
+                audio1 = (
+                    (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
+                    .data.cpu()
+                    .float()
+                    .numpy()
+                )
+            else:
+                audio1 = (
+                    (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
+                )
+        del feats, p_len, padding_mask
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        t2 = ttime()
+        times[0] += t1 - t0
+        times[2] += t2 - t1
+        return audio1
+    def pipeline(
+        self,
+        model,
+        net_g,
+        sid,
+        audio,
+        input_audio_path,
+        times,
+        f0_up_key,
+        f0_method,
+        file_index,
+        # file_big_npy,
+        index_rate,
+        if_f0,
+        filter_radius,
+        tgt_sr,
+        resample_sr,
+        rms_mix_rate,
+        version,
+        protect,
+        crepe_hop_length,
+        f0_file=None,
+    ):
+        if (
+            file_index != ""
+            # and file_big_npy != ""
+            # and os.path.exists(file_big_npy) == True
+            and os.path.exists(file_index) == True
+            and index_rate != 0
+        ):
+            try:
+                index = faiss.read_index(file_index)
+                # big_npy = np.load(file_big_npy)
+                big_npy = index.reconstruct_n(0, index.ntotal)
+            except:
+                traceback.print_exc()
+                index = big_npy = None
+        else:
+            index = big_npy = None
+        audio = signal.filtfilt(bh, ah, audio)
+        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
+        opt_ts = []
+        if audio_pad.shape[0] > self.t_max:
+            audio_sum = np.zeros_like(audio)
+            for i in range(self.window):
+                audio_sum += audio_pad[i : i - self.window]
+            for t in range(self.t_center, audio.shape[0], self.t_center):
+                opt_ts.append(
+                    t
+                    - self.t_query
+                    + np.where(
+                        np.abs(audio_sum[t - self.t_query : t + self.t_query])
+                        == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
+                    )[0][0]
+                )
+        s = 0
+        audio_opt = []
+        t = None
+        t1 = ttime()
+        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
+        p_len = audio_pad.shape[0] // self.window
+        inp_f0 = None
+        if hasattr(f0_file, "name") == True:
+            try:
+                with open(f0_file.name, "r") as f:
+                    lines = f.read().strip("\n").split("\n")
+                inp_f0 = []
+                for line in lines:
+                    inp_f0.append([float(i) for i in line.split(",")])
+                inp_f0 = np.array(inp_f0, dtype="float32")
+            except:
+                traceback.print_exc()
+        sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
+        pitch, pitchf = None, None
+        if if_f0 == 1:
+            pitch, pitchf = self.get_f0(
+                input_audio_path,
+                audio_pad,
+                p_len,
+                f0_up_key,
+                f0_method,
+                filter_radius,
+                crepe_hop_length,
+                inp_f0,
+            )
+            pitch = pitch[:p_len]
+            pitchf = pitchf[:p_len]
+            if self.device == "mps":
+                pitchf = pitchf.astype(np.float32)
+            pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
+            pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
+        t2 = ttime()
+        times[1] += t2 - t1
+        for t in opt_ts:
+            t = t // self.window * self.window
+            if if_f0 == 1:
+                audio_opt.append(
+                    self.vc(
+                        model,
+                        net_g,
+                        sid,
+                        audio_pad[s : t + self.t_pad2 + self.window],
+                        pitch[:, s // self.window : (t + self.t_pad2) // self.window],
+                        pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
+                        times,
+                        index,
+                        big_npy,
+                        index_rate,
+                        version,
+                        protect,
+                    )[self.t_pad_tgt : -self.t_pad_tgt]
+                )
+            else:
+                audio_opt.append(
+                    self.vc(
+                        model,
+                        net_g,
+                        sid,
+                        audio_pad[s : t + self.t_pad2 + self.window],
+                        None,
+                        None,
+                        times,
+                        index,
+                        big_npy,
+                        index_rate,
+                        version,
+                        protect,
+                    )[self.t_pad_tgt : -self.t_pad_tgt]
+                )
+            s = t
+        if if_f0 == 1:
+            audio_opt.append(
+                self.vc(
+                    model,
+                    net_g,
+                    sid,
+                    audio_pad[t:],
+                    pitch[:, t // self.window :] if t is not None else pitch,
+                    pitchf[:, t // self.window :] if t is not None else pitchf,
+                    times,
+                    index,
+                    big_npy,
+                    index_rate,
+                    version,
+                    protect,
+                )[self.t_pad_tgt : -self.t_pad_tgt]
+            )
+        else:
+            audio_opt.append(
+                self.vc(
+                    model,
+                    net_g,
+                    sid,
+                    audio_pad[t:],
+                    None,
+                    None,
+                    times,
+                    index,
+                    big_npy,
+                    index_rate,
+                    version,
+                    protect,
+                )[self.t_pad_tgt : -self.t_pad_tgt]
+            )
+        audio_opt = np.concatenate(audio_opt)
+        if rms_mix_rate != 1:
+            audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
+        if resample_sr >= 16000 and tgt_sr != resample_sr:
+            audio_opt = librosa.resample(
+                audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
+            )
+        audio_max = np.abs(audio_opt).max() / 0.99
+        max_int16 = 32768
+        if audio_max > 1:
+            max_int16 /= audio_max
+        audio_opt = (audio_opt * max_int16).astype(np.int16)
+        del pitch, pitchf, sid
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        return audio_opt