github-actions[bot] committed on
Commit
f80c5ec
0 Parent(s):

Sync to HuggingFace Spaces

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.wav filter=lfs diff=lfs merge=lfs -text
.github/workflows/sync.yml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync to Hugging Face Spaces
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+
8
+ jobs:
9
+ sync:
10
+ name: Sync
11
+ runs-on: ubuntu-latest
12
+
13
+ steps:
14
+ - name: Checkout Repository
15
+ uses: actions/checkout@v4
16
+ with:
17
+ lfs: true
18
+
19
+ - name: Sync to Hugging Face Spaces
20
+ uses: JacobLinCool/huggingface-sync@v1
21
+ with:
22
+ github: ${{ secrets.GITHUB_TOKEN }}
23
+ user: jacoblincool # Hugging Face username or organization name
24
+ space: ZeroRVC # Hugging Face space name
25
+ token: ${{ secrets.HF_TOKEN }} # Hugging Face token
26
+ configuration: headers.yaml
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ .DS_Store
2
+ *.pyc
3
+ __pycache__
4
+ dist/
5
+ logs/
6
+ separated/
LICENSE ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright (c) 2024 Jacob Lin <[email protected]>
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: ZeroRVC
3
+ emoji: 🎙️
4
+ colorFrom: gray
5
+ colorTo: gray
6
+ sdk: gradio
7
+ sdk_version: 4.37.2
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ # ZeroRVC
13
+
14
+ Run Retrieval-based Voice Conversion training and inference with ease.
15
+
16
+ ## Features
17
+
18
+ - [x] Dataset Preparation
19
+ - [x] Hugging Face Datasets Integration
20
+ - [x] Hugging Face Accelerate Integration
21
+ - [x] Trainer API
22
+ - [x] Inference API
23
+ - [ ] Index Support
24
+ - [ ] Tensorboard Support
25
+ - [ ] FP16 Support
26
+
27
+ ## Dataset Preparation
28
+
29
+ ZeroRVC provides a simple API to prepare your dataset for training. You only need to provide the path to your audio files. The feature extraction models will be downloaded automatically, or you can provide your own with the `hubert` and `rmvpe` arguments.
30
+
31
+ ```py
32
+ from zerorvc import prepare
33
+
34
+ dataset = prepare("./my-voices")
35
+ ```
36
+
37
+ Since `dataset` is a Hugging Face Dataset object, you can easily push it to the Hugging Face Hub.
38
+
39
+ ```py
40
+ dataset.push_to_hub("my-rvc-dataset", token=HF_TOKEN)
41
+ ```
42
+
43
+ And bring the preprocessed dataset back with the following code.
44
+
45
+ ```py
46
+ from datasets import load_dataset
47
+
48
+ dataset = load_dataset("my-rvc-dataset")
49
+ ```
50
+
51
+ ## Training
52
+
53
+ Once you've prepared your dataset, you can start training your model with the `RVCTrainer`.
54
+
55
+ ```py
56
+ from tqdm import tqdm
57
+ from zerorvc import RVCTrainer
58
+
59
+ epochs = 100
60
+ trainer = RVCTrainer(checkpoint_dir="./checkpoints")
61
+ training = tqdm(
62
+ trainer.train(
63
+ dataset=dataset["train"], # preprocessed dataset
64
+ resume_from=trainer.latest_checkpoint(), # resume training from the latest checkpoint if any
65
+ epochs=epochs, batch_size=8
66
+ )
67
+ )
68
+
69
+ # Training loop: iterate over epochs
70
+ for checkpoint in training:
71
+ training.set_description(
72
+ f"Epoch {checkpoint.epoch}/{epochs} loss: (gen: {checkpoint.loss_gen:.4f}, fm: {checkpoint.loss_fm:.4f}, mel: {checkpoint.loss_mel:.4f}, kl: {checkpoint.loss_kl:.4f}, disc: {checkpoint.loss_disc:.4f})"
73
+ )
74
+
75
+ # Save checkpoint every 10 epochs
76
+ if checkpoint.epoch % 10 == 0:
77
+ checkpoint.save(checkpoint_dir=trainer.checkpoint_dir)
78
+ # Directly push the synthesizer to the Hugging Face Hub
79
+ checkpoint.G.push_to_hub("my-rvc-model", token=HF_TOKEN)
80
+
81
+ print("Training completed.")
82
+ ```
83
+
84
+ You can also push the whole GAN weights to the Hugging Face Hub.
85
+
86
+ ```py
87
+ checkpoint.push_to_hub("my-rvc-model", token=HF_TOKEN)
88
+ ```
89
+
90
+ ## Inference
91
+
92
+ ZeroRVC provides an easy API to convert your voice with the trained model.
93
+
94
+ ```py
95
+ from zerorvc import RVC
96
+ import soundfile as sf
97
+
98
+ rvc = RVC.from_pretrained("my-rvc-model")
99
+ samples = rvc.convert("test.mp3")
100
+ sf.write("output.wav", samples, rvc.sr)
101
+ ```
app.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from app.settings import SettingsTab
3
+ from app.tutorial import TutotialTab
4
+ from app.dataset import DatasetTab
5
+ from app.train import TrainTab
6
+ from app.infer import InferenceTab
7
+ from app.zero import zero_is_available
8
+
9
# TF32 matmul is only switched on when running on Hugging Face ZeroGPU.
if zero_is_available:
    import torch

    torch.backends.cuda.matmul.allow_tf32 = True


# Assemble the Gradio UI: one settings accordion plus four tabs.
with gr.Blocks() as app:
    gr.Markdown("# ZeroRVC")
    gr.Markdown(
        "Run Retrieval-based Voice Conversion training and inference on Hugging Face ZeroGPU or locally."
    )

    settings = SettingsTab()
    tutorial = TutotialTab()
    dataset = DatasetTab()
    training = TrainTab()
    inference = InferenceTab()

    with gr.Accordion(label="Environment Settings"):
        settings.ui()

    with gr.Tabs():
        with gr.Tab(label="Tutorial", id=0):
            tutorial.ui()

        with gr.Tab(label="Dataset", id=1):
            dataset.ui()

        with gr.Tab(label="Training", id=2):
            training.ui()

        with gr.Tab(label="Inference", id=3):
            inference.ui()

    # Event handlers are wired after every component exists, because the
    # tabs share the settings components (experiment dir, HF token).
    settings.build()
    tutorial.build()
    dataset.build(settings.exp_dir, settings.hf_token)
    training.build(settings.exp_dir, settings.hf_token)
    inference.build(settings.exp_dir)

app.launch()
app/__init__.py ADDED
File without changes
app/constants.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+
4
# Optional Hugging Face access token; None when the variable is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Root directory for per-experiment working directories. Falls back to
# "<this package>/../logs" when ROOT_EXP_DIR is unset or empty.
ROOT_EXP_DIR = Path(
    os.environ.get("ROOT_EXP_DIR")
    or os.path.join(os.path.dirname(os.path.abspath(__file__)), "../logs")
).resolve()
# Created at import time so tempfile.mkdtemp(dir=ROOT_EXP_DIR) can succeed.
ROOT_EXP_DIR.mkdir(exist_ok=True, parents=True)

# `or` (rather than a .get default) also covers variables set to "".
BATCH_SIZE = int(os.environ.get("BATCH_SIZE") or 8)
TRAINING_EPOCHS = int(os.environ.get("TRAINING_EPOCHS") or 10)
app/dataset.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import zipfile
4
+ import tempfile
5
+ from zerorvc import prepare
6
+ from datasets import load_dataset, load_from_disk
7
+ from .constants import ROOT_EXP_DIR, BATCH_SIZE
8
+ from .zero import zero
9
+ from .model import accelerator
10
+
11
+
12
def extract_audio_files(zip_file: str, target_dir: str) -> list[str]:
    """Extract *zip_file* into *target_dir* and list its top-level audio files.

    Args:
        zip_file: Path to the zip archive to extract.
        target_dir: Existing directory that receives the archive content.

    Returns:
        Paths (joined with *target_dir*) of the entries directly inside
        *target_dir* whose name ends in .wav, .mp3 or .ogg.

    Raises:
        gr.Error: If no audio file is found at the top level.
    """
    with zipfile.ZipFile(zip_file, "r") as zip_ref:
        zip_ref.extractall(target_dir)

    audio_extensions = (".wav", ".mp3", ".ogg")
    audio_files = [
        os.path.join(target_dir, name)
        for name in os.listdir(target_dir)
        # Compare case-insensitively so "TAKE1.WAV" is not silently dropped.
        if name.lower().endswith(audio_extensions)
    ]
    if not audio_files:
        raise gr.Error("No audio files found at the top level of the zip file")

    return audio_files
25
+
26
+
27
def make_dataset_from_zip(exp_dir: str, zip_file: str):
    """Stage 1 of dataset creation from an uploaded zip archive.

    Extracts the archive into ``<exp_dir>/raw_data`` and runs
    ``prepare(..., stage=1)`` on that directory.

    Args:
        exp_dir: Experiment directory; when empty a new one is created under
            ROOT_EXP_DIR.
        zip_file: Path of the uploaded zip archive.

    Returns:
        ``(exp_dir, str(dataset))`` so the UI can show the (possibly newly
        created) directory and a textual dataset summary.

    Raises:
        gr.Error: If no archive was provided, or it has no top-level audio.
    """
    # Fail early with a readable message, before creating any directory.
    if not zip_file:
        raise gr.Error("Please upload a zip file first")

    if not exp_dir:
        exp_dir = tempfile.mkdtemp(dir=ROOT_EXP_DIR)
    print(f"Using exp dir: {exp_dir}")

    data_dir = os.path.join(exp_dir, "raw_data")
    os.makedirs(data_dir, exist_ok=True)
    extract_audio_files(zip_file, data_dir)

    ds = prepare(
        data_dir,
        accelerator=accelerator,
        batch_size=BATCH_SIZE,
        stage=1,
    )

    return exp_dir, str(ds)
45
+
46
+
47
@zero(duration=120)
def make_dataset_from_zip_stage_2(exp_dir: str):
    """Run ``prepare`` stage 2 on ``<exp_dir>/raw_data``.

    Decorated with ``zero`` so the call runs inside a ZeroGPU allocation
    when that is available.

    Returns:
        ``(exp_dir, str(dataset))``.
    """
    prepared = prepare(
        os.path.join(exp_dir, "raw_data"),
        stage=2,
        batch_size=BATCH_SIZE,
        accelerator=accelerator,
    )
    summary = str(prepared)
    return exp_dir, summary
57
+
58
+
59
def make_dataset_from_zip_stage_3(exp_dir: str):
    """Run ``prepare`` stage 3 on ``<exp_dir>/raw_data`` and persist the result.

    The prepared dataset is saved to ``<exp_dir>/dataset``.

    Returns:
        ``(exp_dir, str(dataset))``.
    """
    raw_dir = os.path.join(exp_dir, "raw_data")
    prepared = prepare(
        raw_dir,
        stage=3,
        batch_size=BATCH_SIZE,
        accelerator=accelerator,
    )

    prepared.save_to_disk(os.path.join(exp_dir, "dataset"))
    return exp_dir, str(prepared)
71
+
72
+
73
def make_dataset_from_repo(repo: str, hf_token: str):
    """Stage 1 of dataset creation from a Hugging Face dataset repository.

    Loads *repo* with ``load_dataset`` (authenticated with *hf_token*) and
    runs ``prepare(..., stage=1)`` on it.

    Returns:
        The string form of the prepared dataset.
    """
    source = load_dataset(repo, token=hf_token)
    prepared = prepare(
        source,
        stage=1,
        batch_size=BATCH_SIZE,
        accelerator=accelerator,
    )
    return str(prepared)
82
+
83
+
84
@zero(duration=120)
def make_dataset_from_repo_stage_2(repo: str, hf_token: str):
    """Stage 2 of dataset creation from a Hugging Face dataset repository.

    Reloads *repo* and runs ``prepare(..., stage=2)``. Decorated with
    ``zero`` so it runs inside a ZeroGPU allocation when available.

    Returns:
        The string form of the prepared dataset.
    """
    source = load_dataset(repo, token=hf_token)
    prepared = prepare(
        source,
        stage=2,
        batch_size=BATCH_SIZE,
        accelerator=accelerator,
    )
    return str(prepared)
94
+
95
+
96
def make_dataset_from_repo_stage_3(exp_dir: str, repo: str, hf_token: str):
    """Stage 3 of dataset creation from a repository; saves the result.

    Runs ``prepare(..., stage=3)`` on *repo* and stores the dataset in
    ``<exp_dir>/dataset``. A new experiment directory is created under
    ROOT_EXP_DIR when *exp_dir* is empty.

    Returns:
        ``(exp_dir, str(dataset))``.
    """
    prepared = prepare(
        load_dataset(repo, token=hf_token),
        stage=3,
        batch_size=BATCH_SIZE,
        accelerator=accelerator,
    )

    # The experiment directory is only resolved after preparation succeeded.
    if not exp_dir:
        exp_dir = tempfile.mkdtemp(dir=ROOT_EXP_DIR)
    print(f"Using exp dir: {exp_dir}")

    target = os.path.join(exp_dir, "dataset")
    prepared.save_to_disk(target)
    return exp_dir, str(prepared)
112
+
113
+
114
def use_dataset(exp_dir: str, repo: str, hf_token: str):
    """Download an already-preprocessed dataset and store it for training.

    Loads *repo* with ``load_dataset`` and saves it to ``<exp_dir>/dataset``,
    creating a new experiment directory under ROOT_EXP_DIR when *exp_dir* is
    empty.

    Returns:
        ``(exp_dir, str(dataset))``.
    """
    gr.Info("Fetching dataset")
    fetched = load_dataset(repo, token=hf_token)

    if not exp_dir:
        exp_dir = tempfile.mkdtemp(dir=ROOT_EXP_DIR)
    print(f"Using exp dir: {exp_dir}")

    fetched.save_to_disk(os.path.join(exp_dir, "dataset"))
    return exp_dir, str(fetched)
125
+
126
+
127
def upload_dataset(exp_dir: str, repo: str, hf_token: str):
    """Push the dataset stored in ``<exp_dir>/dataset`` to the Hub as private.

    Raises:
        gr.Error: If ``<exp_dir>/dataset`` does not exist.
    """
    dataset_path = os.path.join(exp_dir, "dataset")
    if os.path.exists(dataset_path):
        gr.Info("Uploading dataset")
        stored = load_from_disk(dataset_path)
        stored.push_to_hub(repo, token=hf_token, private=True)
        gr.Info("Dataset uploaded successfully")
        return

    raise gr.Error("Dataset not found")
136
+
137
+
138
class DatasetTab:
    """Gradio tab for creating, fetching and uploading training datasets."""

    def __init__(self):
        pass

    def ui(self):
        """Create the tab's components; must run inside a Gradio layout context."""
        gr.Markdown("# Dataset")
        gr.Markdown("The suggested dataset size is > 5 minutes of audio.")

        gr.Markdown("## Create Dataset from ZIP")
        gr.Markdown(
            "Create a dataset by simply upload a zip file containing audio files. The audio files should be at the top level of the zip file."
        )
        with gr.Row():
            self.zip_file = gr.File(
                label="Upload a zip file containing audio files",
                file_types=["zip"],
            )
            self.make_ds_from_dir = gr.Button(
                value="Create Dataset from ZIP", variant="primary"
            )

        gr.Markdown("## Create Dataset from Dataset Repository")
        gr.Markdown(
            "You can also create a dataset from any Hugging Face dataset repository that has 'audio' column."
        )
        with gr.Row():
            self.repo = gr.Textbox(
                label="Hugging Face Dataset Repository",
                placeholder="username/dataset-name",
            )
            self.make_ds_from_repo = gr.Button(
                value="Create Dataset from Repo", variant="primary"
            )

        gr.Markdown("## Sync Preprocessed Dataset")
        gr.Markdown(
            "After you have preprocessed the dataset, you can upload the dataset to Hugging Face. And fetch it back later directly."
        )
        with gr.Row():
            self.preprocessed_repo = gr.Textbox(
                label="Hugging Face Dataset Repository",
                placeholder="username/dataset-name",
            )
            self.fetch_ds = gr.Button(value="Fetch Dataset", variant="primary")
            self.upload_ds = gr.Button(value="Upload Dataset", variant="primary")

        # Shows str(dataset) as returned by the handlers wired in build().
        self.ds_state = gr.Textbox(label="Dataset Info", lines=5)

    def build(self, exp_dir: gr.Textbox, hf_token: gr.Textbox):
        """Wire button events to the dataset functions.

        Args:
            exp_dir: Shared textbox holding the experiment directory; several
                handlers write the directory they used back into it.
            hf_token: Shared textbox holding the Hugging Face access token.
        """
        # ZIP flow: three chained stages, each starting only if the previous
        # one succeeded. Only stage 2 is wrapped by the `zero` decorator.
        self.make_ds_from_dir.click(
            fn=make_dataset_from_zip,
            inputs=[exp_dir, self.zip_file],
            outputs=[exp_dir, self.ds_state],
        ).success(
            fn=make_dataset_from_zip_stage_2,
            inputs=[exp_dir],
            outputs=[exp_dir, self.ds_state],
        ).success(
            fn=make_dataset_from_zip_stage_3,
            inputs=[exp_dir],
            outputs=[exp_dir, self.ds_state],
        )

        # Repository flow: same three-stage chain; only the last stage needs
        # (and may create) the experiment directory.
        self.make_ds_from_repo.click(
            fn=make_dataset_from_repo,
            inputs=[self.repo, hf_token],
            outputs=[self.ds_state],
        ).success(
            fn=make_dataset_from_repo_stage_2,
            inputs=[self.repo, hf_token],
            outputs=[self.ds_state],
        ).success(
            fn=make_dataset_from_repo_stage_3,
            inputs=[exp_dir, self.repo, hf_token],
            outputs=[exp_dir, self.ds_state],
        )

        self.fetch_ds.click(
            fn=use_dataset,
            inputs=[exp_dir, self.preprocessed_repo, hf_token],
            outputs=[exp_dir, self.ds_state],
        )

        self.upload_ds.click(
            fn=upload_dataset,
            inputs=[exp_dir, self.preprocessed_repo, hf_token],
            outputs=[],
        )
app/dataset_maker.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yt_dlp
2
+ import numpy as np
3
+ import librosa
4
+ import soundfile as sf
5
+ import os
6
+ import zipfile
7
+
8
+
9
def download_youtube_audio(url, audio_name):
    """Download the audio track of *url* with yt-dlp and extract it as WAV.

    Args:
        url: Video URL handed to ``YoutubeDL.download``.
        audio_name: Base name used in the output template.

    Returns:
        The path ``youtubeaudio/<audio_name>.wav``.
    """
    base_path = f"youtubeaudio/{audio_name}"
    wav_extractor = {
        "key": "FFmpegExtractAudio",
        "preferredcodec": "wav",
    }
    options = {
        "format": "bestaudio/best",
        "postprocessors": [wav_extractor],
        "outtmpl": base_path,  # Output template
    }
    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([url])
    return f"{base_path}.wav"
24
+
25
+
26
def get_rms(y, frame_length=2048, hop_length=512, pad_mode="constant"):
    """Compute the frame-wise root-mean-square of *y* along its last axis.

    The input is padded by ``frame_length // 2`` on both sides, cut into
    overlapping windows of ``frame_length`` samples, and every
    ``hop_length``-th window is kept.

    Returns:
        An array whose second-to-last axis has length 1 and whose last axis
        indexes the frames.
    """
    half = int(frame_length // 2)
    padded = np.pad(y, (half, half), mode=pad_mode)

    # Zero-copy view with one window per start position: the extra trailing
    # axis walks through the samples of each window.
    n_windows = padded.shape[-1] - (frame_length - 1)
    view_shape = tuple(padded.shape[:-1]) + (n_windows, frame_length)
    view_strides = padded.strides + (padded.strides[-1],)
    windows = np.lib.stride_tricks.as_strided(
        padded, shape=view_shape, strides=view_strides
    )

    # Put the in-window axis second to last, then keep every hop-th window.
    frames = np.moveaxis(windows, -1, -2)[..., ::hop_length]

    mean_power = np.mean(np.abs(frames) ** 2, axis=-2, keepdims=True)
    return np.sqrt(mean_power)
48
+
49
+
50
class Slicer:
    """Split a waveform into chunks at low-energy (silent) regions.

    All time arguments are given in milliseconds and are converted to frame
    counts (units of ``hop_size`` samples) in the constructor.
    """

    def __init__(
        self,
        sr,
        threshold=-40.0,
        min_length=5000,
        min_interval=300,
        hop_size=20,
        max_sil_kept=5000,
    ):
        """Configure the slicer.

        Args:
            sr: Sample rate of the audio that will be sliced.
            threshold: Level in dB below which a frame counts as silent.
            min_length: Minimum chunk length in ms.
            min_interval: Minimum silence length in ms that allows a cut.
            hop_size: Frame hop in ms.
            max_sil_kept: Maximum silence in ms kept around a chunk.

        Raises:
            ValueError: If ``min_length >= min_interval >= hop_size`` or
                ``max_sil_kept >= hop_size`` does not hold.
        """
        if not min_length >= min_interval >= hop_size:
            raise ValueError(
                "The following condition must be satisfied: min_length >= min_interval >= hop_size"
            )
        if not max_sil_kept >= hop_size:
            raise ValueError(
                "The following condition must be satisfied: max_sil_kept >= hop_size"
            )
        min_interval = sr * min_interval / 1000
        # dB -> linear amplitude.
        self.threshold = 10 ** (threshold / 20.0)
        self.hop_size = round(sr * hop_size / 1000)
        self.win_size = min(round(min_interval), 4 * self.hop_size)
        # The three values below are expressed in frames, not samples.
        self.min_length = round(sr * min_length / 1000 / self.hop_size)
        self.min_interval = round(min_interval / self.hop_size)
        self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)

    def _apply_slice(self, waveform, begin, end):
        """Return the samples between frame indices *begin* and *end*."""
        if len(waveform.shape) > 1:
            return waveform[
                :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)
            ]
        else:
            return waveform[
                begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)
            ]

    def slice(self, waveform):
        """Cut *waveform* at silent regions and return the list of chunks.

        A multi-dimensional waveform is averaged over axis 0 for silence
        detection; the returned chunks keep the original layout. Input not
        longer than ``min_length`` is returned unchanged as a one-item list.
        """
        if len(waveform.shape) > 1:
            samples = waveform.mean(axis=0)
        else:
            samples = waveform
        # self.min_length is in frames, so convert the sample count to frames
        # (rounding up) before comparing; comparing raw samples made this
        # guard fire only for clips shorter than a few hops.
        if (samples.shape[0] + self.hop_size - 1) // self.hop_size <= self.min_length:
            return [waveform]
        rms_list = get_rms(
            y=samples, frame_length=self.win_size, hop_length=self.hop_size
        ).squeeze(0)
        sil_tags = []
        silence_start = None
        clip_start = 0
        for i, rms in enumerate(rms_list):
            # Silent frame: remember where the silence began and move on.
            if rms < self.threshold:
                if silence_start is None:
                    silence_start = i
                continue
            # Voiced frame with no pending silence: nothing to decide.
            if silence_start is None:
                continue
            # A silence just ended; cut only if it is a long enough leading
            # silence, or long enough in a clip that is already long enough.
            is_leading_silence = silence_start == 0 and i > self.max_sil_kept
            need_slice_middle = (
                i - silence_start >= self.min_interval
                and i - clip_start >= self.min_length
            )
            if not is_leading_silence and not need_slice_middle:
                silence_start = None
                continue
            # Record the range of silent frames to remove; cut positions are
            # the quietest frames within the allowed windows.
            if i - silence_start <= self.max_sil_kept:
                pos = rms_list[silence_start : i + 1].argmin() + silence_start
                if silence_start == 0:
                    sil_tags.append((0, pos))
                else:
                    sil_tags.append((pos, pos))
                clip_start = pos
            elif i - silence_start <= self.max_sil_kept * 2:
                pos = rms_list[
                    i - self.max_sil_kept : silence_start + self.max_sil_kept + 1
                ].argmin()
                pos += i - self.max_sil_kept
                pos_l = (
                    rms_list[
                        silence_start : silence_start + self.max_sil_kept + 1
                    ].argmin()
                    + silence_start
                )
                pos_r = (
                    rms_list[i - self.max_sil_kept : i + 1].argmin()
                    + i
                    - self.max_sil_kept
                )
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                    clip_start = pos_r
                else:
                    sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
                    clip_start = max(pos_r, pos)
            else:
                pos_l = (
                    rms_list[
                        silence_start : silence_start + self.max_sil_kept + 1
                    ].argmin()
                    + silence_start
                )
                pos_r = (
                    rms_list[i - self.max_sil_kept : i + 1].argmin()
                    + i
                    - self.max_sil_kept
                )
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                else:
                    sil_tags.append((pos_l, pos_r))
                clip_start = pos_r
            silence_start = None
        # Trailing silence that never met a voiced frame.
        total_frames = rms_list.shape[0]
        if (
            silence_start is not None
            and total_frames - silence_start >= self.min_interval
        ):
            silence_end = min(total_frames, silence_start + self.max_sil_kept)
            pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
            sil_tags.append((pos, total_frames + 1))
        # Turn the removed ranges into the chunks between them.
        if len(sil_tags) == 0:
            return [waveform]
        else:
            chunks = []
            if sil_tags[0][0] > 0:
                chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0]))
            for i in range(len(sil_tags) - 1):
                chunks.append(
                    self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0])
                )
            if sil_tags[-1][1] < total_frames:
                chunks.append(
                    self._apply_slice(waveform, sil_tags[-1][1], total_frames)
                )
            return chunks
185
+
186
+
187
def slice_audio(file_path, audio_name):
    """Slice the audio at *file_path* on silence and save the chunks as WAV.

    Chunks are written to ``dataset/<audio_name>/split_<i>.wav`` at the
    file's native sample rate.

    Returns:
        The output directory ``dataset/<audio_name>``.
    """
    output_dir = f"dataset/{audio_name}"
    waveform, sample_rate = librosa.load(file_path, sr=None, mono=False)
    os.makedirs(output_dir, exist_ok=True)

    splitter = Slicer(
        sr=sample_rate,
        threshold=-40,
        min_length=5000,
        min_interval=500,
        hop_size=10,
        max_sil_kept=500,
    )
    for index, piece in enumerate(splitter.slice(waveform)):
        # Multi-dimensional chunks are transposed before being written.
        data = piece.T if len(piece.shape) > 1 else piece
        sf.write(f"{output_dir}/split_{index}.wav", data, sample_rate)
    return output_dir
205
+
206
+
207
def zip_directory(directory_path, audio_name):
    """Zip every file under *directory_path* into ``dataset/<audio_name>.zip``.

    Archive member names are relative to *directory_path*.

    Returns:
        The path of the created zip file.
    """
    archive_path = f"dataset/{audio_name}.zip"
    # Make sure the folder that will hold the archive exists.
    os.makedirs(os.path.dirname(archive_path), exist_ok=True)

    with zipfile.ZipFile(archive_path, "w", zipfile.ZIP_DEFLATED) as archive:
        for folder, _subdirs, filenames in os.walk(directory_path):
            for filename in filenames:
                full_path = os.path.join(folder, filename)
                member_name = os.path.relpath(full_path, start=directory_path)
                archive.write(full_path, member_name)
    return archive_path
218
+
219
+
220
def process_audio(url, audio_name):
    """Download, slice and zip the audio of *url* (Gradio entry point).

    Args:
        url: Video URL to download the audio from.
        audio_name: Base name for the downloaded file, chunk folder and zip.

    Returns:
        ``(zip_file, message)``: the path of the created archive and a
        human-readable status message.
    """
    file_path = download_youtube_audio(url, audio_name)
    dataset_path = slice_audio(file_path, audio_name)
    zip_file = zip_directory(dataset_path, audio_name)

    # print() returns None, so returning its result handed callers an empty
    # status; build the message once, log it, and return the text itself.
    message = f"{zip_file} successfully processed"
    print(message)
    return zip_file, message
app/infer.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import hashlib
4
+ from pathlib import Path
5
+ from typing import Tuple
6
+ from demucs.separate import main as demucs
7
+ import gradio as gr
8
+ import numpy as np
9
+ import soundfile as sf
10
+ from zerorvc import RVC
11
+ from .zero import zero
12
+ from .model import device
13
+
14
+
15
@zero(duration=120)
def infer(
    exp_dir: str, original_audio: str, pitch_mod: int, protect: float
) -> str:
    """Separate vocals from *original_audio* and convert them with the model.

    The upload is copied to ``<exp_dir>/<md5><ext>`` so the demucs output
    folder is keyed by content hash and separation is skipped when
    ``separated/htdemucs/<md5>/vocals.wav`` already exists.

    Args:
        exp_dir: Experiment directory containing ``checkpoints``.
        original_audio: Path of the uploaded audio file.
        pitch_mod: Value passed to ``RVC.convert`` as ``pitch_modification``.
        protect: Value passed to ``RVC.convert`` as ``protect``.

    Returns:
        Path of the converted vocals, ``<exp_dir>/infer.wav``.

    Raises:
        gr.Error: If the checkpoints directory is missing or no audio was
            provided.
    """
    checkpoint_dir = os.path.join(exp_dir, "checkpoints")
    if not os.path.exists(checkpoint_dir):
        raise gr.Error("Model not found")
    # Without this guard a missing upload surfaces as an opaque TypeError
    # from open().
    if not original_audio:
        raise gr.Error("Please upload the original audio first")

    # rename the original audio to the hash
    with open(original_audio, "rb") as f:
        original_audio_hash = hashlib.md5(f.read()).hexdigest()
    ext = Path(original_audio).suffix
    original_audio_hashed = os.path.join(exp_dir, f"{original_audio_hash}{ext}")
    shutil.copy(original_audio, original_audio_hashed)

    # Path is relative to the current working directory.
    out = os.path.join("separated", "htdemucs", original_audio_hash, "vocals.wav")
    if not os.path.exists(out):
        demucs(
            [
                "--two-stems",
                "vocals",
                "-d",
                str(device),
                "-n",
                "htdemucs",
                original_audio_hashed,
            ]
        )

    rvc = RVC.from_pretrained(checkpoint_dir)
    samples = rvc.convert(out, pitch_modification=pitch_mod, protect=protect)
    file = os.path.join(exp_dir, "infer.wav")
    sf.write(file, samples, rvc.sr)

    return file
50
+
51
+
52
def merge(exp_dir: str, original_audio: str, vocal: Tuple[int, np.ndarray]) -> str:
    """Mix the converted vocals with the separated accompaniment via ffmpeg.

    Args:
        exp_dir: Experiment directory used for the intermediate files.
        original_audio: Path of the originally uploaded audio; its MD5 hash
            locates ``separated/htdemucs/<md5>/no_vocals.wav``.
        vocal: ``(sample_rate, samples)`` of the converted vocals.

    Returns:
        Path of the merged file, ``<exp_dir>/tmp.wav.merged.mp3``.

    Raises:
        gr.Error: If ffmpeg is unavailable or exits with a non-zero status.
    """
    # Imported locally so this function stays self-contained.
    import subprocess

    with open(original_audio, "rb") as f:
        original_audio_hash = hashlib.md5(f.read()).hexdigest()
    music = os.path.join("separated", "htdemucs", original_audio_hash, "no_vocals.wav")

    tmp = os.path.join(exp_dir, "tmp.wav")
    sf.write(tmp, vocal[1], vocal[0])

    merged = f"{tmp}.merged.mp3"
    # Argument list instead of a shell string: exp_dir comes from a
    # user-editable textbox, so interpolating it into a shell command would
    # break on spaces and allow command injection.
    command = [
        "ffmpeg",
        "-i",
        music,
        "-i",
        tmp,
        "-filter_complex",
        "[1]volume=2[a];[0][a]amix=inputs=2:duration=first:dropout_transition=2",
        "-ac",
        "2",
        "-y",
        merged,
    ]
    try:
        completed = subprocess.run(command, check=False)
    except FileNotFoundError as err:
        raise gr.Error("ffmpeg is not installed or not on PATH") from err
    if completed.returncode != 0:
        raise gr.Error("Failed to merge the converted vocals with the music")

    return merged
65
+
66
+
67
class InferenceTab:
    """Gradio tab that converts an uploaded song with the trained model."""

    def __init__(self):
        pass

    def ui(self):
        """Create the tab's components; must run inside a Gradio layout context."""
        gr.Markdown("# Inference")
        gr.Markdown(
            "After trained model is pruned, you can use it to infer on new music. \n"
            "Upload the original audio and adjust the F0 add value to generate the inferred audio."
        )

        with gr.Row():
            # type="filepath": the handlers receive a path, not sample data.
            self.original_audio = gr.Audio(
                label="Upload original audio",
                type="filepath",
                show_download_button=True,
            )

            with gr.Column():
                self.pitch_mod = gr.Slider(
                    label="Pitch Modification +/-",
                    minimum=-16,
                    maximum=16,
                    step=1,
                    value=0,
                )
                self.protect = gr.Slider(
                    label="Protect",
                    minimum=0,
                    maximum=0.5,
                    step=0.01,
                    value=0.33,
                )

        self.infer_btn = gr.Button(value="Infer", variant="primary")
        with gr.Row():
            self.infer_output = gr.Audio(
                label="Inferred audio", show_download_button=True, format="mp3"
            )
        with gr.Row():
            self.merge_output = gr.Audio(
                label="Merged audio", show_download_button=True, format="mp3"
            )

    def build(self, exp_dir: gr.Textbox):
        """Wire the Infer button: run `infer`, then `merge` if it succeeded.

        Args:
            exp_dir: Shared textbox holding the experiment directory.
        """
        self.infer_btn.click(
            fn=infer,
            inputs=[
                exp_dir,
                self.original_audio,
                self.pitch_mod,
                self.protect,
            ],
            outputs=[self.infer_output],
        ).success(
            # merge reads the converted vocals back from the infer_output
            # component and mixes them with the separated accompaniment.
            fn=merge,
            inputs=[exp_dir, self.original_audio, self.infer_output],
            outputs=[self.merge_output],
        )
app/model.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from accelerate import Accelerator
3
+ from zerorvc import load_hubert, load_rmvpe
4
+
5
logger = logging.getLogger(__name__)

# A single Accelerator, its device and both feature-extraction models are
# created at import time and shared by the modules that import them.
accelerator = Accelerator()
device = accelerator.device

logger.info(f"device: {device}")
logger.info(f"mixed_precision: {accelerator.mixed_precision}")

rmvpe = load_rmvpe(device=device)
logger.info("RMVPE model loaded.")

hubert = load_hubert(device=device)
logger.info("HuBERT model loaded.")
app/settings.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from .constants import HF_TOKEN
3
+
4
+
5
class SettingsTab:
    """Shared environment settings: experiment directory and HF access token."""

    def __init__(self):
        pass

    def ui(self):
        """Create the settings components; other tabs read them as inputs."""
        # Event handlers in other tabs write the directory they created back
        # into this textbox.
        self.exp_dir = gr.Textbox(
            label="Temporary Experiment Directory (auto-managed)",
            placeholder="It will be auto-generated after setup",
            interactive=True,
        )
        gr.Markdown(
            "### Sync with Hugging Face 🤗\n\nThe access token will be use to upload/download the dataset and model."
        )
        # Pre-filled from the HF_TOKEN constant (may be None).
        self.hf_token = gr.Textbox(
            label="Hugging Face Access Token",
            placeholder="Paste your Hugging Face access token here (hf_...)",
            value=HF_TOKEN,
            interactive=True,
        )

    def build(self):
        """No events to wire; present for symmetry with the other tabs."""
        pass
app/train.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ import gradio as gr
4
+ import torch
5
+ from zerorvc import RVCTrainer, pretrained_checkpoints, SynthesizerTrnMs768NSFsid
6
+ from zerorvc.trainer import TrainingCheckpoint
7
+ from datasets import load_from_disk
8
+ from huggingface_hub import snapshot_download
9
+ from .zero import zero
10
+ from .model import accelerator, device
11
+ from .constants import BATCH_SIZE, ROOT_EXP_DIR, TRAINING_EPOCHS
12
+
13
+
14
@zero(duration=240)
def train_model(exp_dir: str, progress=gr.Progress()):
    """Train for TRAINING_EPOCHS epochs on the dataset stored in *exp_dir*.

    Resumes from the latest checkpoint in ``<exp_dir>/checkpoints`` when one
    exists, otherwise starts from the pretrained checkpoints. After the last
    epoch the checkpoint and the generator (``G``) are saved to the
    checkpoint directory.

    Args:
        exp_dir: Experiment directory containing ``dataset``.
        progress: Gradio progress tracker wrapping the epoch iterator.

    Returns:
        A summary string with the losses of the last epoch.

    Raises:
        gr.Error: If the dataset is missing or training yields no checkpoint.
    """
    dataset = os.path.join(exp_dir, "dataset")
    if not os.path.exists(dataset):
        raise gr.Error("Dataset not found. Please prepare the dataset first.")

    ds = load_from_disk(dataset)
    checkpoint_dir = os.path.join(exp_dir, "checkpoints")
    trainer = RVCTrainer(checkpoint_dir)

    resume_from = trainer.latest_checkpoint()
    if resume_from is None:
        resume_from = pretrained_checkpoints()
        gr.Info("Starting training from pretrained checkpoints.")
    else:
        gr.Info(f"Resuming training from {resume_from}")

    tqdm = progress.tqdm(
        trainer.train(
            dataset=ds["train"],
            resume_from=resume_from,
            batch_size=BATCH_SIZE,
            epochs=TRAINING_EPOCHS,
            accelerator=accelerator,
        ),
        total=TRAINING_EPOCHS,
        unit="epochs",
        desc="Training",
    )

    # Stays None when the generator yields nothing, so that case is reported
    # clearly instead of failing with UnboundLocalError below.
    latest = None
    for ckpt in tqdm:
        info = f"Epoch: {ckpt.epoch} loss: (gen: {ckpt.loss_gen:.4f}, fm: {ckpt.loss_fm:.4f}, mel: {ckpt.loss_mel:.4f}, kl: {ckpt.loss_kl:.4f}, disc: {ckpt.loss_disc:.4f})"
        print(info)
        latest = ckpt

    if latest is None:
        raise gr.Error("Training did not produce any checkpoint.")

    latest.save(trainer.checkpoint_dir)
    latest.G.save_pretrained(trainer.checkpoint_dir)

    result = f"{TRAINING_EPOCHS} epochs trained. Latest loss: (gen: {latest.loss_gen:.4f}, fm: {latest.loss_fm:.4f}, mel: {latest.loss_mel:.4f}, kl: {latest.loss_kl:.4f}, disc: {latest.loss_disc:.4f})"

    # Drop the trainer and release cached CUDA memory before returning.
    del trainer
    if device.type == "cuda":
        torch.cuda.empty_cache()

    return result
59
+
60
+
61
def upload_model(exp_dir: str, repo: str, hf_token: str):
    """Push the synthesizer in ``<exp_dir>/checkpoints`` to the Hub as private.

    Raises:
        gr.Error: If the checkpoints directory does not exist.
    """
    model_dir = os.path.join(exp_dir, "checkpoints")
    if os.path.exists(model_dir):
        gr.Info("Uploading model")
        synthesizer = SynthesizerTrnMs768NSFsid.from_pretrained(model_dir)
        synthesizer.push_to_hub(repo, token=hf_token, private=True)
        gr.Info("Model uploaded successfully")
        return

    raise gr.Error("Model not found")
70
+
71
+
72
def upload_checkpoints(exp_dir: str, repo: str, hf_token: str):
    """Push the training checkpoints in ``<exp_dir>/checkpoints`` to the Hub.

    The upload goes through ``RVCTrainer.push_to_hub`` with ``private=True``.

    Raises:
        gr.Error: If the checkpoints directory does not exist.
    """
    ckpt_dir = os.path.join(exp_dir, "checkpoints")
    if os.path.exists(ckpt_dir):
        gr.Info("Uploading checkpoints")
        RVCTrainer(ckpt_dir).push_to_hub(repo, token=hf_token, private=True)
        gr.Info("Checkpoints uploaded successfully")
        return

    raise gr.Error("Checkpoints not found")
81
+
82
+
83
def fetch_model(exp_dir: str, repo: str, hf_token: str):
    """Download only the model files of *repo* into ``<exp_dir>/checkpoints``.

    A new experiment directory is created under ROOT_EXP_DIR when *exp_dir*
    is empty.

    Returns:
        The experiment directory that was used.
    """
    exp_dir = exp_dir or tempfile.mkdtemp(dir=ROOT_EXP_DIR)
    target_dir = os.path.join(exp_dir, "checkpoints")

    gr.Info("Fetching model")
    # Only these files are downloaded; everything else in the repo is skipped.
    wanted = ["README.md", "config.json", "model.safetensors"]
    snapshot_download(
        repo,
        token=hf_token,
        local_dir=target_dir,
        allow_patterns=wanted,
    )
    gr.Info("Model fetched successfully")

    return exp_dir
96
+
97
+
98
def fetch_checkpoints(exp_dir: str, repo: str, hf_token: str):
    """Download the whole *repo* snapshot into ``<exp_dir>/checkpoints``.

    A new experiment directory is created under ROOT_EXP_DIR when *exp_dir*
    is empty.

    Returns:
        The experiment directory that was used.
    """
    exp_dir = exp_dir or tempfile.mkdtemp(dir=ROOT_EXP_DIR)
    target_dir = os.path.join(exp_dir, "checkpoints")

    gr.Info("Fetching checkpoints")
    snapshot_download(repo, token=hf_token, local_dir=target_dir)
    gr.Info("Checkpoints fetched successfully")

    return exp_dir
108
+
109
+
110
class TrainTab:
    """Gradio tab for training and for syncing models/checkpoints with the Hub."""

    def __init__(self):
        pass

    def ui(self):
        """Create the tab's components; must run inside a Gradio layout context."""
        gr.Markdown("# Training")
        gr.Markdown(
            "You can start training the model by clicking the button below. "
            f"Each time you click the button, the model will train for {TRAINING_EPOCHS} epochs, which takes about 3 minutes on ZeroGPU (A100). "
        )

        with gr.Row():
            self.train_btn = gr.Button(value="Train", variant="primary")
            self.result = gr.Textbox(label="Training Result", lines=3)

        gr.Markdown("## Sync Model and Checkpoints with Hugging Face")
        gr.Markdown(
            "You can upload the trained model and checkpoints to Hugging Face for sharing or further training."
        )

        # One repository field is shared by all four sync buttons.
        self.repo = gr.Textbox(label="Repository ID", placeholder="username/repo")
        with gr.Row():
            self.upload_model_btn = gr.Button(value="Upload Model", variant="primary")
            self.upload_checkpoints_btn = gr.Button(
                value="Upload Checkpoints", variant="primary"
            )
        with gr.Row():
            self.fetch_mode_btn = gr.Button(value="Fetch Model", variant="primary")
            self.fetch_checkpoints_btn = gr.Button(
                value="Fetch Checkpoints", variant="primary"
            )

    def build(self, exp_dir: gr.Textbox, hf_token: gr.Textbox):
        """Wire the buttons to the training and sync functions.

        Args:
            exp_dir: Shared textbox holding the experiment directory; the
                fetch handlers write the directory they used back into it.
            hf_token: Shared textbox holding the Hugging Face access token.
        """
        self.train_btn.click(
            fn=train_model,
            inputs=[exp_dir],
            outputs=[self.result],
        )

        self.upload_model_btn.click(
            fn=upload_model,
            inputs=[exp_dir, self.repo, hf_token],
        )

        self.upload_checkpoints_btn.click(
            fn=upload_checkpoints,
            inputs=[exp_dir, self.repo, hf_token],
        )

        self.fetch_mode_btn.click(
            fn=fetch_model,
            inputs=[exp_dir, self.repo, hf_token],
            outputs=[exp_dir],
        )

        self.fetch_checkpoints_btn.click(
            fn=fetch_checkpoints,
            inputs=[exp_dir, self.repo, hf_token],
            outputs=[exp_dir],
        )
app/tutorial.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+
4
class TutotialTab:
    """Static landing tab that shows the ZeroRVC welcome text and usage steps."""

    def __init__(self):
        pass

    def ui(self):
        """Render the tutorial text as a single Markdown component."""
        welcome_text = """
            # Welcome to ZeroRVC!

            > If you are more satisfied with Python code, you can also [use the Python API to run ZeroRVC](https://pypi.org/project/zerorvc/).

            ZeroRVC is a toolkit for training and inference of retrieval-based voice conversion models.

            By leveraging the power of Hugging Face ZeroGPU, you can train your model in minutes without setting up the environment.

            ## How to Use

            There are 3 main steps to use ZeroRVC:

            - **Make Dataset**: Prepare your dataset for training. You can upload a zip file containing audio files.
            - **Model Training**: Train your model using the prepared dataset.
            - **Model Inference**: Try your model.
            """
        gr.Markdown(welcome_text)

    def build(self):
        """Nothing to wire up: this tab has no interactive components."""
        pass
app/zero.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+
4
logger = logging.getLogger(__name__)

# True when the SPACES_ZERO_GPU environment variable exists (any value).
# NOTE(review): presumably set by Hugging Face ZeroGPU Spaces - confirm.
zero_is_available = os.environ.get("SPACES_ZERO_GPU") is not None

if not zero_is_available:
    logger.info("ZeroGPU is not available")
else:
    # Imported lazily so the `spaces` package is only required when the
    # environment variable is present.
    import spaces  # type: ignore

    logger.info("ZeroGPU is available")
14
+
15
+
16
# Decorator factory: applies spaces.GPU only when ZeroGPU is available.
def zero(duration=60):
    """Return a decorator that wraps a function with ``spaces.GPU`` if possible.

    Args:
        duration: Forwarded unchanged as the ``duration`` keyword of
            ``spaces.GPU``. Ignored when ZeroGPU is not available.

    Returns:
        A decorator. With ZeroGPU available it returns
        ``spaces.GPU(func, duration=duration)``; otherwise it returns the
        function untouched.
    """

    def decorate(target):
        # Availability is checked at decoration time, not when zero() is called.
        return spaces.GPU(target, duration=duration) if zero_is_available else target

    return decorate
example-dataset.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from zerorvc import prepare
3
+
4
# Hugging Face access token taken from the environment (None when unset).
HF_TOKEN = os.getenv("HF_TOKEN")

# Build the dataset from the audio files found under ./my-voices.
dataset = prepare("./my-voices")
print(dataset)

# Publish the prepared dataset to the Hub as a private repository.
dataset.push_to_hub("my-rvc-dataset", private=True, token=HF_TOKEN)
example-infer.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from zerorvc import RVC
3
+ import soundfile as sf
4
+
5
# Hugging Face access token taken from the environment (None when unset).
HF_TOKEN = os.getenv("HF_TOKEN")
MODEL = "JacobLinCool/my-rvc-model3"

rvc = RVC.from_pretrained(MODEL, token=HF_TOKEN)

# Conversion with the default pitch.
samples = rvc.convert("test.mp3")
sf.write("output.wav", samples, rvc.sr)

# One extra output file per pitch modification value.
pitch_modifications = [-12, -8, -4, 4, 8, 12]
for pitch_modification in pitch_modifications:
    target_file = f"output-{pitch_modification}.wav"
    samples = rvc.convert("test.mp3", pitch_modification=pitch_modification)
    sf.write(target_file, samples, rvc.sr)
example-train.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from datasets import load_dataset
3
+ from tqdm import tqdm
4
+ from zerorvc import RVCTrainer, pretrained_checkpoints
5
+
6
# Hugging Face access token taken from the environment (None when unset).
HF_TOKEN = os.getenv("HF_TOKEN")
EPOCHS = 100
BATCH_SIZE = 8
DATASET = "JacobLinCool/my-rvc-dataset"
MODEL = "JacobLinCool/my-rvc-model"

dataset = load_dataset(DATASET, token=HF_TOKEN)
print(dataset)

trainer = RVCTrainer(checkpoint_dir="./checkpoints")

# trainer.train(...) is iterated once per epoch below; tqdm only adds the
# progress bar around it.
epoch_stream = trainer.train(
    dataset=dataset["train"],
    resume_from=pretrained_checkpoints(),  # resume training from the pretrained VCTK checkpoint
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)
training = tqdm(epoch_stream, total=EPOCHS)

# Training loop: iterate over epochs
for checkpoint in training:
    losses = (
        f"gen: {checkpoint.loss_gen:.4f}, fm: {checkpoint.loss_fm:.4f}, "
        f"mel: {checkpoint.loss_mel:.4f}, kl: {checkpoint.loss_kl:.4f}, "
        f"disc: {checkpoint.loss_disc:.4f}"
    )
    training.set_description(f"Epoch {checkpoint.epoch}/{EPOCHS} loss: ({losses})")

    # Only every 10th epoch is persisted and pushed.
    if checkpoint.epoch % 10 != 0:
        continue

    checkpoint.save(checkpoint_dir=trainer.checkpoint_dir)
    # Directly push the synthesizer to the Hugging Face Hub
    checkpoint.G.push_to_hub(MODEL, token=HF_TOKEN, private=True)

print("Training completed.")
headers.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ title: ZeroRVC
2
+ emoji: 🎙️
3
+ colorFrom: gray
4
+ colorTo: gray
5
+ sdk: gradio
6
+ sdk_version: 4.37.2
7
+ app_file: app.py
8
+ pinned: false
my-voices/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *.wav
pyproject.toml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "zerorvc"
3
+ version = "0.0.8"
4
+ authors = [{ name = "Jacob Lin", email = "[email protected]" }]
5
+ description = "Run Retrieval-based Voice Conversion training and inference with ease."
6
+ readme = "README.md"
7
+ requires-python = ">=3.8"
8
+ classifiers = [
9
+ "Programming Language :: Python :: 3",
10
+ "License :: OSI Approved :: MIT License",
11
+ "Operating System :: OS Independent",
12
+ ]
13
+ dependencies = [
14
+ "numpy>=1.0.0",
15
+ "torch>=2.0.0",
16
+ "datasets",
17
+ "accelerate",
18
+ "huggingface_hub",
19
+ "tqdm",
20
+ "fairseq",
21
+ "librosa",
22
+ "scipy",
23
+ ]
24
+
25
+ [project.urls]
26
+ Homepage = "https://github.com/jacoblincool/zero-rvc"
27
+ Issues = "https://github.com/jacoblincool/zero-rvc/issues"
28
+
29
+ [build-system]
30
+ requires = ["hatchling"]
31
+ build-backend = "hatchling.build"
32
+
33
+ [tool.hatch.build.targets.sdist]
34
+ include = ["zerorvc/**/*", "pyproject.toml", "README.md", "LICENSE"]
35
+ [tool.hatch.build.targets.wheel]
36
+ packages = ["zerorvc"]
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ zerorvc>=0.0.8
2
+
3
+ # gradio app deps
4
+ gradio==4.37.2
5
+ demucs==4.0.1
6
+ yt_dlp
zerorvc/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from .rvc import RVC
2
+ from .trainer import RVCTrainer
3
+ from .dataset import prepare
4
+ from .synthesizer import SynthesizerTrnMs768NSFsid
5
+ from .pretrained import pretrained_checkpoints
6
+ from .f0 import load_rmvpe, RMVPE, F0Extractor
7
+ from .hubert import load_hubert, HubertModel, HubertFeatureExtractor
8
+ from .auto_loader import auto_loaded_model
zerorvc/assets/mute/mute48k.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f2bb4daaa106e351aebb001e5a25de985c0b472f22e8d60676bc924a79056ee
3
+ size 288078
zerorvc/auto_loader.py ADDED
@@ -0,0 +1 @@
 
 
1
# Process-wide cache of models that the loaders downloaded themselves; the
# loaders visible in this dump use the keys "rmvpe" and "hubert".
auto_loaded_model = dict()
zerorvc/constants.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
# Sampling rates in Hz.
SR_16K = 16_000
SR_48K = 48_000

# Default STFT settings of calculate_spectrogram.
N_FFT = 2048
HOP_LENGTH = 480
WIN_LENGTH = 2048
# NOTE(review): presumably the number of mel bands; confirm against its users.
N_MELS = 128
zerorvc/dataset.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import torch
4
+ import librosa
5
+ import logging
6
+ import shutil
7
+ from pkg_resources import resource_filename
8
+ from accelerate import Accelerator
9
+ from datasets import load_dataset, DatasetDict, Audio
10
+ from .preprocess import Preprocessor, crop_feats_length
11
+ from .hubert import HubertFeatureExtractor, HubertModel, load_hubert
12
+ from .f0 import F0Extractor, RMVPE, load_rmvpe
13
+ from .constants import *
14
+
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
def extract_hubert_features(
    rows,
    hfe: HubertFeatureExtractor,
    hubert: str | HubertModel | None,
    device: torch.device,
):
    """Batched ``map`` step that adds HuBERT features for each 16 kHz clip.

    Args:
        rows: Batch with a ``"wav_16k"`` column whose items expose the
            waveform under ``"array"``.
        hfe: Feature extractor; a model is loaded into it on first use.
        hubert: Forwarded to ``load_hubert`` when ``hfe`` has no model yet.
        device: Device passed to ``load_hubert``.

    Returns:
        dict: ``{"hubert_feats": [...]}`` with one entry per input clip.
    """
    if not hfe.is_loaded():
        # Lazy load, so the model is only created once map() actually runs.
        hfe.load(load_hubert(hubert, device))

    return {
        "hubert_feats": [
            hfe.extract_feature_from(clip["array"].astype("float32"))
            for clip in rows["wav_16k"]
        ]
    }
33
+
34
+
35
def extract_f0_features(
    rows, f0e: F0Extractor, rmvpe: str | RMVPE | None, device: torch.device
):
    """Batched ``map`` step that adds both pitch tracks for each 16 kHz clip.

    Args:
        rows: Batch with a ``"wav_16k"`` column whose items expose the
            waveform under ``"array"``.
        f0e: Pitch extractor; a model is loaded into it on first use.
        rmvpe: Forwarded to ``load_rmvpe`` when ``f0e`` has no model yet.
        device: Device passed to ``load_rmvpe``.

    Returns:
        dict: ``{"f0": [...], "f0nsf": [...]}`` with one entry per clip, taken
        from the second and first element of ``extract_f0_from``'s result.
    """
    if not f0e.is_loaded():
        f0e.load(load_rmvpe(rmvpe, device))

    # extract_f0_from returns (f0nsf, f0) in that order.
    pairs = [
        f0e.extract_f0_from(clip["array"].astype("float32"))
        for clip in rows["wav_16k"]
    ]
    return {
        "f0": [coarse for _, coarse in pairs],
        "f0nsf": [fine for fine, _ in pairs],
    }
48
+
49
+
50
def feature_postprocess(rows):
    """Duplicate every HuBERT frame and cap all tracks at 900 frames, in place.

    Each ``"hubert_feats"`` entry is repeated twice along axis 0 and cut to
    at most 900 rows. When present, the matching ``"f0"`` entry is cut to the
    same length and converted to a float32 array, and the ``"f0nsf"`` entry is
    cut to the same length without conversion.
    NOTE(review): the 2x repeat presumably aligns the HuBERT frame rate with
    the pitch tracks - confirm.

    Args:
        rows: Batch mapping with ``"hubert_feats"`` and optionally ``"f0"`` /
            ``"f0nsf"`` columns.

    Returns:
        The same ``rows`` object, modified in place.
    """
    has_pitch = "f0" in rows
    has_pitchf = "f0nsf" in rows

    feats = rows["hubert_feats"]
    for idx, feat in enumerate(feats):
        doubled = np.repeat(feat, 2, axis=0)
        keep = min(doubled.shape[0], 900)
        feats[idx] = doubled[:keep, :]

        if has_pitch:
            rows["f0"][idx] = np.array(rows["f0"][idx][:keep], dtype=np.float32)
        if has_pitchf:
            rows["f0nsf"][idx] = rows["f0nsf"][idx][:keep]
    return rows
68
+
69
+
70
def calculate_spectrogram(
    rows, n_fft=N_FFT, hop_length=HOP_LENGTH, win_length=WIN_LENGTH
):
    """Batched ``map`` step that adds a magnitude spectrogram per ``wav_gt`` clip.

    Every waveform is reflect-padded by ``(win_length - hop_length) / 2``
    samples on both sides and transformed with an uncentered STFT.

    Args:
        rows: Batch with a ``"wav_gt"`` column whose items expose the waveform
            under ``"array"``.
        n_fft: FFT size passed to ``librosa.stft``.
        hop_length: Hop size passed to ``librosa.stft``.
        win_length: Window length; also the length of the Hann window.

    Returns:
        dict: ``{"spec": [...]}`` holding ``|STFT| + 1e-6`` for each clip.
    """
    window = np.hanning(win_length)
    margin = int((win_length - hop_length) / 2)

    def magnitude(wave):
        padded = np.pad(wave, (margin, margin), mode="reflect")
        transform = librosa.stft(
            padded,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            center=False,
        )
        # The small offset keeps every bin strictly positive.
        return np.abs(transform) + 1e-6

    return {"spec": [magnitude(clip["array"]) for clip in rows["wav_gt"]]}
88
+
89
+
90
def fix_length(rows, hop_length=HOP_LENGTH):
    """Crop all per-clip features to a common length, in place.

    The spectrogram, HuBERT features and both pitch tracks go through
    ``crop_feats_length``; the waveform is then cut to
    ``phone_len * hop_length`` samples, where ``phone_len`` is the number of
    rows of the cropped HuBERT features.

    Args:
        rows: Batch with ``"spec"``, ``"hubert_feats"``, ``"f0"``, ``"f0nsf"``
            and ``"wav_gt"`` columns.
        hop_length: Number of waveform samples kept per feature frame.

    Returns:
        The same ``rows`` object, modified in place.
    """
    for idx, raw_spec in enumerate(rows["spec"]):
        audio = rows["wav_gt"][idx]
        waveform = np.array(audio["array"])

        spec, phone, pitch, pitchf = crop_feats_length(
            np.array(raw_spec),
            np.array(rows["hubert_feats"][idx]),
            np.array(rows["f0"][idx]),
            np.array(rows["f0nsf"][idx]),
        )

        rows["hubert_feats"][idx] = phone
        rows["f0"][idx] = pitch
        rows["f0nsf"][idx] = pitchf
        rows["spec"][idx] = spec
        audio["array"] = waveform[: phone.shape[0] * hop_length]
    return rows
109
+
110
+
111
def prepare(
    dir: str | DatasetDict,
    sr=SR_48K,
    hubert: str | HubertModel | None = None,
    rmvpe: str | RMVPE | None = None,
    batch_size=1,
    accelerator: Accelerator = None,
    include_mute=True,
    stage=3,
):
    """
    Turn an audio folder (or an existing DatasetDict) into a feature dataset.

    The work is split into stages; ``stage`` selects the last one to run:

    * ``stage <= 0``: only load the audio (a folder is loaded with the
      "audiofolder" builder and its "audio" column is cast to ``sr``).
    * ``stage == 1``: slice the audio into ``wav_gt`` (at ``sr``) and
      ``wav_16k`` (resampled to 16 kHz) columns. CPU intensive.
    * ``stage == 2``: add HuBERT features and the two pitch tracks.
      GPU intensive.
    * ``stage >= 3``: post-process the features, add spectrograms and crop
      everything to a common length. CPU intensive.

    Args:
        dir (str | DatasetDict): Directory with audio files, or an already
            loaded DatasetDict, which is used as given.
        sr (int, optional): Target sampling rate. Defaults to SR_48K.
        hubert (str | HubertModel | None, optional): Forwarded to the HuBERT
            loader. Defaults to None.
        rmvpe (str | RMVPE | None, optional): Forwarded to the RMVPE loader.
            Defaults to None.
        batch_size (int, optional): Batch size of every ``map`` call.
            Defaults to 1.
        accelerator (Accelerator, optional): Supplies the device for stage 2;
            a new one is created when omitted. Defaults to None.
        include_mute (bool, optional): For a directory input, copy the bundled
            mute clip into it as "mute.wav" unless that file already exists.
            Defaults to True.
        stage (int, optional): Last preparation stage to run. Defaults to 3.

    Returns:
        DatasetDict: The dataset as it stands after the requested stage.
    """
    accelerator = Accelerator() if accelerator is None else accelerator

    if not isinstance(dir, DatasetDict):
        mute_source = resource_filename("zerorvc", "assets/mute/mute48k.wav")
        mute_dest = os.path.join(dir, "mute.wav")
        if include_mute and not os.path.exists(mute_dest):
            # Side effect: this writes into the caller's dataset directory.
            logger.info(f"Copying {mute_source} to {mute_dest}")
            shutil.copy(mute_source, mute_dest)

        ds: DatasetDict = load_dataset("audiofolder", data_dir=dir)
        ds = ds.cast_column("audio", Audio(sampling_rate=sr))
    else:
        ds = dir

    if stage <= 0:
        return ds

    # Stage 1, CPU intensive

    # NOTE(review): the meaning of the second argument (3.0) is defined by
    # Preprocessor, which is not visible here.
    pp = Preprocessor(sr, 3.0)

    def preprocess(rows):
        wav_gt, wav_16k = [], []
        for row in rows["audio"]:
            for piece in pp.preprocess_audio(row["array"]):
                wav_gt.append({"path": "", "array": piece, "sampling_rate": sr})
                piece_16k = librosa.resample(piece, orig_sr=sr, target_sr=SR_16K)
                wav_16k.append(
                    {"path": "", "array": piece_16k, "sampling_rate": SR_16K}
                )
        return {"wav_gt": wav_gt, "wav_16k": wav_16k}

    ds = ds.map(
        preprocess, batched=True, batch_size=batch_size, remove_columns=["audio"]
    )
    ds = ds.cast_column("wav_gt", Audio(sampling_rate=sr))
    ds = ds.cast_column("wav_16k", Audio(sampling_rate=SR_16K))

    if stage <= 1:
        return ds

    # Stage 2, GPU intensive

    # Both extractors start empty; their models are loaded inside the mapped
    # functions on first use.
    hfe = HubertFeatureExtractor()
    ds = ds.map(
        extract_hubert_features,
        batched=True,
        batch_size=batch_size,
        fn_kwargs={"hfe": hfe, "hubert": hubert, "device": accelerator.device},
    )

    f0e = F0Extractor()
    ds = ds.map(
        extract_f0_features,
        batched=True,
        batch_size=batch_size,
        fn_kwargs={"f0e": f0e, "rmvpe": rmvpe, "device": accelerator.device},
    )

    if stage <= 2:
        return ds

    # Stage 3, CPU intensive

    for step in (feature_postprocess, calculate_spectrogram, fix_length):
        ds = ds.map(step, batched=True, batch_size=batch_size)

    return ds
zerorvc/f0/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .extractor import F0Extractor
2
+ from .rmvpe import RMVPE
3
+ from .load import load_rmvpe
zerorvc/f0/extractor.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import numpy as np
3
+ import librosa
4
+ from .rmvpe import RMVPE
5
+ from ..constants import SR_16K
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
class F0Extractor:
    """Extract a continuous pitch track with RMVPE and quantize it to bins.

    Two tracks are produced for every signal: ``f0nsf`` (pitch in Hz, 0 where
    the model reports no pitch) and ``f0`` (integer bins in
    ``[1, f0_bin - 1]`` on a mel-scaled axis, with bin 1 also used for frames
    without pitch).
    """

    def __init__(
        self,
        rmvpe: RMVPE = None,
        sr=SR_16K,
        f0_bin=256,
        f0_max=1100.0,
        f0_min=50.0,
    ):
        """
        Args:
            rmvpe: Optional pre-loaded model; when omitted, call ``load``
                before extracting.
            sr: Sampling rate used by ``extract_f0`` when reading a file.
            f0_bin: Number of quantization bins (valid output bins are
                ``1 .. f0_bin - 1``).
            f0_max: Frequency in Hz mapped to the top of the bin range.
            f0_min: Frequency in Hz mapped to the bottom of the bin range.
        """
        self.sr = sr
        self.f0_bin = f0_bin
        self.f0_max = f0_max
        self.f0_min = f0_min
        # Mel-scale positions of the range limits: 1127 * ln(1 + f / 700).
        self.f0_mel_min = 1127 * np.log(1 + f0_min / 700)
        self.f0_mel_max = 1127 * np.log(1 + f0_max / 700)

        if rmvpe is not None:
            self.load(rmvpe)

    def load(self, rmvpe: RMVPE):
        """Attach the model and record the device of its first parameter."""
        self.rmvpe = rmvpe
        self.device = next(rmvpe.parameters()).device
        logger.info(f"RMVPE model is on {self.device}")

    def is_loaded(self) -> bool:
        """Return True once ``load`` has attached a model."""
        return hasattr(self, "rmvpe")

    def calculate_f0_from_f0nsf(self, f0nsf: np.ndarray):
        """Quantize a pitch track in Hz into integer bins.

        Args:
            f0nsf: Pitch per frame in Hz; non-positive values mean "no pitch".
                The input array is not modified.

        Returns:
            np.ndarray: Integer bins in ``[1, f0_bin - 1]``; an empty input
            yields an empty result.
        """
        top_bin = self.f0_bin - 1

        f0_mel = 1127 * np.log(1 + f0nsf / 700)
        voiced = f0_mel > 0
        f0_mel[voiced] = (f0_mel[voiced] - self.f0_mel_min) * (
            self.f0_bin - 2
        ) / (self.f0_mel_max - self.f0_mel_min) + 1

        # Clamp into the valid range; frames without pitch end up in bin 1.
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > top_bin] = top_bin
        f0 = np.rint(f0_mel).astype(int)

        # Sanity check against the configured bin count (previously a fixed
        # 255, which was wrong for any f0_bin other than 256). Skipped for an
        # empty track, where max()/min() would raise.
        if f0.size:
            assert f0.max() <= top_bin and f0.min() >= 1, (
                f0.max(),
                f0.min(),
            )

        return f0

    def extract_f0_from(self, y: np.ndarray, modification=0.0):
        """Run RMVPE on a waveform and return ``(f0nsf, f0)``.

        Args:
            y: Waveform samples.
            modification: Pitch shift; the track is scaled by
                ``2 ** (modification / 12)`` before quantization.

        Returns:
            tuple: ``(f0nsf, f0)`` - pitch in Hz and the quantized bins.
        """
        f0nsf = self.rmvpe.infer_from_audio(y, thred=0.03)

        f0nsf *= pow(2, modification / 12)

        f0 = self.calculate_f0_from_f0nsf(f0nsf)

        return f0nsf, f0

    def extract_f0(self, wav_file: str):
        """Load ``wav_file`` at ``self.sr`` and return ``(f0nsf, f0)`` for it."""
        y, _ = librosa.load(wav_file, sr=self.sr)
        return self.extract_f0_from(y)
zerorvc/f0/load.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from huggingface_hub import hf_hub_download
3
+ from .rmvpe import RMVPE
4
+ from ..auto_loader import auto_loaded_model
5
+
6
+
7
def load_rmvpe(
    rmvpe: str | RMVPE | None = None, device: torch.device = torch.device("cpu")
) -> RMVPE:
    """
    Load the RMVPE model from a file or download it if necessary.

    Models built by this function are switched to evaluation mode, because
    the network contains Dropout and BatchNorm layers and is only used for
    inference.

    Args:
        rmvpe (str | RMVPE | None): Path to a checkpoint file, or an already
            constructed model. If None, the default checkpoint is downloaded
            once and cached in ``auto_loaded_model``.
        device (torch.device): The device to load the model on.

    Returns:
        RMVPE: The loaded RMVPE model. A model passed in by the caller is
        moved to ``device`` and otherwise left unchanged (including its
        train/eval mode). The cached default model is returned on whichever
        device it was first loaded on.

    Raises:
        If the model file does not exist.
    """
    if isinstance(rmvpe, RMVPE):
        return rmvpe.to(device)

    def build(checkpoint_path: str) -> RMVPE:
        model = RMVPE(4, 1, (2, 2))
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        model.load_state_dict(torch.load(checkpoint_path, map_location="cpu"))
        model.to(device)
        # Without eval() the Dropout(0.25) in the output head and the
        # BatchNorm batch statistics stay active and make inference noisy.
        model.eval()
        return model

    if isinstance(rmvpe, str):
        return build(rmvpe)

    if "rmvpe" not in auto_loaded_model:
        checkpoint = hf_hub_download("lj1995/VoiceConversionWebUI", "rmvpe.pt")
        auto_loaded_model["rmvpe"] = build(checkpoint)
    return auto_loaded_model["rmvpe"]
zerorvc/f0/rmvpe/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # The RMVPE model is from https://github.com/Dream-High/RMVPE
2
+ # Apache License 2.0: https://github.com/Dream-High/RMVPE/blob/main/LICENSE
3
+ # With modifications from https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/infer/lib/rmvpe.py
4
+ # MIT License: https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/LICENSE
5
+
6
+ from .model import RMVPE
zerorvc/f0/rmvpe/constants.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
# Number of pitch classes predicted per frame.
N_CLASS = 360
# Number of mel bands of the input spectrogram.
N_MELS = 128
# Offset in cents of pitch class 0; the model decodes cents relative to 10 Hz.
MAGIC_CONST = 1997.3794084376191

# Mel-spectrogram front-end settings.
SAMPLE_RATE = 16_000
WINDOW_LENGTH = 1024
HOP_LENGTH = 160
MEL_FMIN = 30
MEL_FMAX = SAMPLE_RATE // 2  # half the sampling rate
zerorvc/f0/rmvpe/deepunet.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ import torch
3
+ from torch import nn
4
+ from .constants import *
5
+
6
+
7
class ConvBlockRes(nn.Module):
    """Two 3x3 conv + BatchNorm + ReLU stages with a residual connection.

    When the channel count changes, the residual path goes through a 1x1
    convolution stored as ``shortcut``; otherwise the attribute does not
    exist and the input itself is added.
    """

    def __init__(self, in_channels: int, out_channels: int, momentum=0.01):
        super().__init__()

        def stage(channels_in: int):
            # One conv/BN/ReLU triple; spatial size is preserved.
            return [
                nn.Conv2d(
                    in_channels=channels_in,
                    out_channels=out_channels,
                    kernel_size=(3, 3),
                    stride=(1, 1),
                    padding=(1, 1),
                    bias=False,
                ),
                nn.BatchNorm2d(out_channels, momentum=momentum),
                nn.ReLU(),
            ]

        self.conv = nn.Sequential(*stage(in_channels), *stage(out_channels))
        if in_channels != out_channels:
            self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))

    def forward(self, x: torch.Tensor):
        """Return ``conv(x)`` plus the (possibly projected) input."""
        out = self.conv(x)
        residual = self.shortcut(x) if hasattr(self, "shortcut") else x
        return out + residual
41
+
42
+
43
class Encoder(nn.Module):
    """Stack of ``ResEncoderBlock`` stages preceded by a BatchNorm layer.

    Each stage doubles the channel count of the previous one and the tracked
    size is halved per stage. ``out_channel`` holds the channel count one
    doubling past the last stage, and ``out_size`` the size after all halvings.
    """

    def __init__(
        self,
        in_channels: int,
        in_size: int,
        n_encoders: int,
        kernel_size: int,
        n_blocks: int,
        out_channels=16,
        momentum=0.01,
    ):
        super().__init__()
        self.n_encoders = n_encoders
        self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
        self.layers = nn.ModuleList()
        self.latent_channels = []

        stage_in, stage_out, stage_size = in_channels, out_channels, in_size
        for _ in range(self.n_encoders):
            self.layers.append(
                ResEncoderBlock(
                    stage_in, stage_out, kernel_size, n_blocks, momentum=momentum
                )
            )
            self.latent_channels.append([stage_out, stage_size])
            stage_in, stage_out, stage_size = stage_out, stage_out * 2, stage_size // 2

        self.out_size = stage_size
        self.out_channel = stage_out

    def forward(self, x: torch.Tensor):
        """Return ``(x, skips)``: the final pooled output and each stage's
        un-pooled output, in stage order."""
        skips: List[torch.Tensor] = []
        x = self.bn(x)
        for layer in self.layers:
            skip, x = layer(x)
            skips.append(skip)
        return x, skips
79
+
80
+
81
class ResEncoderBlock(nn.Module):
    """``n_blocks`` residual conv blocks, optionally followed by average pooling.

    With ``kernel_size=None`` no pooling layer is created and ``forward``
    returns a single tensor; otherwise it returns ``(features, pooled)``.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        n_blocks=1,
        momentum=0.01,
    ):
        super().__init__()
        self.n_blocks = n_blocks
        # Only the first block changes the channel count.
        self.conv = nn.ModuleList(
            [ConvBlockRes(in_channels, out_channels, momentum)]
            + [
                ConvBlockRes(out_channels, out_channels, momentum)
                for _ in range(n_blocks - 1)
            ]
        )
        self.kernel_size = kernel_size
        if self.kernel_size is not None:
            self.pool = nn.AvgPool2d(kernel_size=kernel_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the conv blocks; also return the pooled result when pooling
        is configured (the return annotation only covers the no-pool case)."""
        for block in self.conv:
            x = block(x)
        if self.kernel_size is None:
            return x
        return x, self.pool(x)
107
+
108
+
109
class Intermediate(nn.Module):
    """Chain of ``n_inters`` pooling-free ``ResEncoderBlock`` layers."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        n_inters: int,
        n_blocks: int,
        momentum=0.01,
    ):
        super().__init__()
        self.n_inters = n_inters
        # The first layer maps in_channels -> out_channels; the remaining ones
        # keep the channel count. kernel_size=None disables pooling.
        self.layers = nn.ModuleList(
            [ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum)]
        )
        for _ in range(self.n_inters - 1):
            self.layers.append(
                ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum)
            )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the input through every layer in order."""
        for layer in self.layers:
            x = layer(x)
        return x
133
+
134
+
135
class ResDecoderBlock(nn.Module):
    """Transposed-conv upsampling followed by residual blocks on the
    concatenation with a skip tensor."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        stride: int,
        n_blocks=1,
        momentum=0.01,
    ):
        super().__init__()
        # Output padding depends on the stride: (0, 1) for a (1, 2) stride,
        # (1, 1) for anything else.
        if stride == (1, 2):
            out_padding = (0, 1)
        else:
            out_padding = (1, 1)
        self.n_blocks = n_blocks

        upsample = nn.ConvTranspose2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=(3, 3),
            stride=stride,
            padding=(1, 1),
            output_padding=out_padding,
            bias=False,
        )
        self.conv1 = nn.Sequential(
            upsample,
            nn.BatchNorm2d(out_channels, momentum=momentum),
            nn.ReLU(),
        )

        # The first block sees the upsampled tensor concatenated with the
        # skip tensor, hence twice the channels.
        self.conv2 = nn.ModuleList(
            [ConvBlockRes(out_channels * 2, out_channels, momentum)]
        )
        for _ in range(n_blocks - 1):
            self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))

    def forward(self, x: torch.Tensor, concat_tensor: torch.Tensor) -> torch.Tensor:
        """Upsample ``x``, concatenate ``concat_tensor`` on dim 1, refine."""
        x = torch.cat((self.conv1(x), concat_tensor), dim=1)
        for block in self.conv2:
            x = block(x)
        return x
171
+
172
+
173
class Decoder(nn.Module):
    """Stack of ``ResDecoderBlock`` layers, each halving the channel count."""

    def __init__(
        self,
        in_channels: int,
        n_decoders: int,
        stride: int,
        n_blocks: int,
        momentum=0.01,
    ):
        super().__init__()
        self.layers = nn.ModuleList()
        self.n_decoders = n_decoders

        channels = in_channels
        for _ in range(self.n_decoders):
            halved = channels // 2
            self.layers.append(
                ResDecoderBlock(channels, halved, stride, n_blocks, momentum)
            )
            channels = halved

    def forward(
        self, x: torch.Tensor, concat_tensors: List[torch.Tensor]
    ) -> torch.Tensor:
        """Decode ``x``, consuming ``concat_tensors`` from last to first."""
        for step, layer in enumerate(self.layers):
            skip = concat_tensors[-(step + 1)]
            x = layer(x, skip)
        return x
198
+
199
+
200
class DeepUnet(nn.Module):
    """Encoder / intermediate / decoder network with skip connections."""

    def __init__(
        self,
        kernel_size: int,
        n_blocks: int,
        en_de_layers=5,
        inter_layers=4,
        in_channels=1,
        en_out_channels=16,
    ):
        super().__init__()
        self.encoder = Encoder(
            in_channels, N_MELS, en_de_layers, kernel_size, n_blocks, en_out_channels
        )

        # Encoder.out_channel is one doubling past its last stage, so the
        # tensor leaving the encoder has out_channel // 2 channels.
        bottleneck_channels = self.encoder.out_channel
        self.intermediate = Intermediate(
            bottleneck_channels // 2,
            bottleneck_channels,
            inter_layers,
            n_blocks,
        )
        # kernel_size doubles as the decoder's upsampling stride.
        self.decoder = Decoder(bottleneck_channels, en_de_layers, kernel_size, n_blocks)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Encode, transform and decode ``x`` using the encoder's skip tensors."""
        encoded, skips = self.encoder(x)
        return self.decoder(self.intermediate(encoded), skips)
zerorvc/f0/rmvpe/mel.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import numpy as np
4
+ import librosa
5
+
6
+
7
class MelSpectrogram(nn.Module):
    """Log-mel spectrogram front end built on ``torch.stft``.

    The mel filterbank and the Hann window are registered as non-persistent
    buffers, so they follow the module across devices but are not stored in
    its state dict.
    """

    def __init__(
        self,
        n_mel_channels: int,
        sampling_rate: int,
        win_length: int,
        hop_length: int,
        n_fft: int = None,
        mel_fmin: int = 0,
        mel_fmax: int = None,
        clamp: float = 1e-5,
    ):
        """
        Args:
            n_mel_channels: Number of mel bands.
            sampling_rate: Sampling rate used to build the filterbank.
            win_length: STFT window length.
            hop_length: STFT hop length.
            n_fft: FFT size; falls back to ``win_length`` when None.
            mel_fmin: Lowest filterbank frequency.
            mel_fmax: Highest filterbank frequency, passed to librosa as is.
            clamp: Lower bound applied to the mel energies before the log.
        """
        super().__init__()
        if n_fft is None:
            n_fft = win_length

        filterbank = librosa.filters.mel(
            sr=sampling_rate,
            n_fft=n_fft,
            n_mels=n_mel_channels,
            fmin=mel_fmin,
            fmax=mel_fmax,
            htk=True,
        )
        self.register_buffer(
            "mel_basis", torch.from_numpy(filterbank).float(), persistent=False
        )

        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.sampling_rate = sampling_rate
        self.n_mel_channels = n_mel_channels
        self.clamp = clamp

        # Key shift and speed are fixed at their neutral values, so the
        # "*_new" sizes below equal the plain ones.
        self.keyshift = 0
        self.speed = 1
        self.factor = 2 ** (self.keyshift / 12)
        self.n_fft_new = int(np.round(self.n_fft * self.factor))
        self.win_length_new = int(np.round(self.win_length * self.factor))
        self.hop_length_new = int(np.round(self.hop_length * self.speed))
        self.register_buffer(
            "hann_window_0", torch.hann_window(self.win_length_new), persistent=False
        )

    def forward(self, audio: torch.Tensor, center=True):
        """Return the log of the clamped mel energies of ``audio``."""
        spectrum = torch.stft(
            audio,
            n_fft=self.n_fft_new,
            hop_length=self.hop_length_new,
            win_length=self.win_length_new,
            window=self.hann_window_0,
            center=center,
            return_complex=True,
        )
        magnitude = torch.sqrt(spectrum.real.pow(2) + spectrum.imag.pow(2))
        mel_energy = torch.matmul(self.mel_basis, magnitude)
        return torch.log(torch.clamp(mel_energy, min=self.clamp))
zerorvc/f0/rmvpe/model.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ import numpy as np
6
+ from .seq import BiGRU
7
+ from .deepunet import DeepUnet
8
+ from .mel import MelSpectrogram
9
+ from .constants import *
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class RMVPE(nn.Module):
    """Pitch estimator: mel front end, DeepUnet, and a per-frame classifier
    over ``N_CLASS`` pitch bins, decoded to Hz by a local weighted average."""

    def __init__(
        self,
        n_blocks: int,
        n_gru: int,
        kernel_size: int,
        en_de_layers=5,
        inter_layers=4,
        in_channels=1,
        en_out_channels=16,
    ):
        """
        Args:
            n_blocks: Residual blocks per U-Net stage.
            n_gru: Number of GRU layers in the head; 0 selects a plain linear
                head.
            kernel_size: Pooling kernel / upsampling stride of the U-Net.
            en_de_layers: Number of encoder and decoder stages.
            inter_layers: Number of intermediate layers.
            in_channels: Input channels of the U-Net.
            en_out_channels: Channels of the first encoder stage.
        """
        super().__init__()
        # Device that infer_from_audio moves its input to; updated by to().
        self.device = torch.device("cpu")
        self.mel_extractor = MelSpectrogram(
            N_MELS, SAMPLE_RATE, WINDOW_LENGTH, HOP_LENGTH, None, MEL_FMIN, MEL_FMAX
        )
        self.unet = DeepUnet(
            kernel_size,
            n_blocks,
            en_de_layers,
            inter_layers,
            in_channels,
            en_out_channels,
        )
        self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
        if n_gru:
            self.fc = nn.Sequential(
                BiGRU(3 * N_MELS, 256, n_gru),
                nn.Linear(512, N_CLASS),
                nn.Dropout(0.25),
                nn.Sigmoid(),
            )
        else:
            self.fc = nn.Sequential(
                nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid()
            )

        # Cents value of every pitch bin (20 cents apart), zero-padded by 4 on
        # both sides to match the padded salience in to_local_average_cents.
        cents_mapping = 20 * np.arange(N_CLASS) + MAGIC_CONST
        self.cents_mapping = np.pad(cents_mapping, (4, 4))  # N_CLASS + 8 entries

    def forward(self, mel: torch.Tensor) -> torch.Tensor:
        """Map a mel spectrogram to per-frame salience over the pitch bins."""
        # Swap the last two axes and add a channel axis for the 2-D U-Net.
        mel = mel.transpose(-1, -2).unsqueeze(1)
        x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
        x = self.fc(x)
        return x

    def to(self, device):
        """Move the module and remember ``device`` for ``infer_from_audio``."""
        self.device = device
        return super().to(device)

    def mel2hidden(self, mel: torch.Tensor):
        """Run the network without gradients and return salience per frame.

        The frame axis is right-padded to a multiple of 32 before the forward
        pass and the padding is cut off again afterwards.
        NOTE(review): 32 presumably matches the total U-Net downsampling for
        the default configuration - confirm.
        """
        with torch.no_grad():
            n_frames = mel.shape[-1]
            n_pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames
            if n_pad > 0:
                mel = F.pad(mel, (0, n_pad), mode="constant")
            hidden = self(mel)
            return hidden[:, :n_frames]

    def decode(self, hidden: np.ndarray, thred=0.03):
        """Convert salience (frames x bins) to pitch in Hz.

        Frames whose peak salience is at or below ``thred`` decode to 0 cents,
        i.e. exactly 10 Hz, and are then reported as 0.
        """
        cents_pred = self.to_local_average_cents(hidden, thred=thred)
        f0 = 10 * (2 ** (cents_pred / 1200))
        f0[f0 == 10] = 0
        return f0

    def infer(self, audio: torch.Tensor, thred=0.03):
        """Estimate the pitch track (Hz, NumPy array) of a 1-D audio tensor."""
        mel = self.mel_extractor(audio.unsqueeze(0), center=True)
        hidden = self.mel2hidden(mel)[0]
        # decode() works on NumPy arrays, so convert explicitly instead of
        # relying on NumPy's implicit handling of torch tensors.
        return self.decode(hidden.float().cpu().numpy(), thred=thred)

    def infer_from_audio(self, audio: np.ndarray, thred=0.03):
        """Estimate the pitch track of a NumPy waveform on ``self.device``."""
        audio = torch.from_numpy(audio).to(self.device)
        return self.infer(audio, thred=thred)

    def to_local_average_cents(self, salience: np.ndarray, thred=0.05) -> np.ndarray:
        """Salience-weighted mean of the cents values around each frame's peak.

        For every frame the 9 bins centred on the arg-max bin are averaged,
        weighted by their salience. Frames whose maximum salience is
        ``<= thred`` yield 0.

        Args:
            salience: Array of shape (frames, bins); may have zero frames.
            thred: Salience threshold below which a frame is treated as having
                no pitch.

        Returns:
            np.ndarray: One cents value per frame.
        """
        salience = np.asarray(salience)
        center = np.argmax(salience, axis=1) + 4  # peak index in padded coordinates
        salience = np.pad(salience, ((0, 0), (4, 4)))  # (frames, bins + 8)

        # Index window [center - 4, center + 4] for every frame: (frames, 9).
        window = center[:, None] + np.arange(-4, 5)
        local_salience = np.take_along_axis(salience, window, axis=1)
        local_cents = self.cents_mapping[window]

        product_sum = np.sum(local_salience * local_cents, 1)
        weight_sum = np.sum(local_salience, 1)  # (frames,)
        # An all-zero window gives 0/0; such frames are zeroed just below
        # whenever thred >= 0, so the warning is suppressed.
        with np.errstate(divide="ignore", invalid="ignore"):
            devided = product_sum / weight_sum  # (frames,)

        maxx = np.max(salience, axis=1)  # (frames,)
        devided[maxx <= thred] = 0
        return devided
zerorvc/f0/rmvpe/seq.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
class BiGRU(nn.Module):
    """Batch-first bidirectional GRU that returns only the output sequence."""

    def __init__(self, input_features: int, hidden_features: int, num_layers: int):
        super().__init__()
        gru_options = dict(
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )
        self.gru = nn.GRU(input_features, hidden_features, **gru_options)
        self.gru.flatten_parameters()

    def forward(self, x: torch.Tensor):
        """Return the GRU outputs; the final hidden state is discarded.

        Being bidirectional, the last dimension is ``2 * hidden_features``.
        """
        outputs, _final_state = self.gru(x)
        return outputs
zerorvc/hubert/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .extractor import HubertFeatureExtractor, HubertModel
2
+ from .load import load_hubert
zerorvc/hubert/extractor.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import torch
3
+ import librosa
4
+ import numpy as np
5
+ from fairseq.models.hubert import HubertModel
6
+ from ..constants import SR_16K
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
class HubertFeatureExtractor:
    """Thin wrapper that runs a HuBERT model and returns NumPy features."""

    def __init__(self, hubert: HubertModel = None, sr=SR_16K):
        """
        Args:
            hubert: Optional pre-loaded model; when omitted, call ``load``
                before extracting.
            sr: Sampling rate used by ``extract_feature`` when reading a file.
        """
        self.sr = sr
        if hubert is not None:
            self.load(hubert)

    def load(self, hubert: HubertModel):
        """Attach the model and record the device of its first parameter."""
        self.hubert = hubert
        self.device = next(hubert.parameters()).device
        logger.info(f"HuBERT model is on {self.device}")

    def is_loaded(self) -> bool:
        """Return True once ``load`` has attached a model."""
        return hasattr(self, "hubert")

    def extract_feature_from(self, y: np.ndarray) -> np.ndarray:
        """Extract layer-12 features for one waveform.

        Args:
            y: Waveform samples; a batch axis is added before the forward pass.

        Returns:
            np.ndarray: Float features with the batch axis removed. If any NaN
            is present, the array is passed through ``np.nan_to_num``.
        """
        source = torch.tensor(y).unsqueeze(0).to(self.device)
        # All-False mask: nothing in the single-item batch is padding.
        no_padding = torch.zeros(source.shape, dtype=torch.bool, device=self.device)

        with torch.no_grad():
            logits = self.hubert.extract_features(
                source=source,
                padding_mask=no_padding,
                output_layer=12,
            )
            feats = logits[0].squeeze(0).float().cpu().numpy()

        if np.isnan(feats).any():
            feats = np.nan_to_num(feats)
        return feats

    def extract_feature(self, wav_file: str) -> np.ndarray:
        """Load ``wav_file`` at ``self.sr`` and return its features."""
        waveform, _ = librosa.load(wav_file, sr=self.sr)
        return self.extract_feature_from(waveform)
zerorvc/hubert/load.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from fairseq.checkpoint_utils import load_model_ensemble_and_task
3
+ from fairseq.models.hubert import HubertModel
4
+ from huggingface_hub import hf_hub_download
5
+ from ..auto_loader import auto_loaded_model
6
+
7
+
8
def load_hubert(
    hubert: str | HubertModel | None = None,
    device: torch.device = torch.device("cpu"),
) -> HubertModel:
    """
    Resolve ``hubert`` into a ready-to-use HuBERT model.

    Args:
        hubert (str | HubertModel | None): A checkpoint path, an already
            loaded model, or None to use the default checkpoint downloaded
            from the Hugging Face Hub.
        device (torch.device): The device the model is moved to.

    Returns:
        HubertModel: The loaded Hubert model.

    Notes:
        The default (``None``) model is cached in ``auto_loaded_model``.
        On later calls the cached instance is returned as stored, so
        ``device`` only takes effect the first time it is loaded.
    """
    # Already a model: just place it on the requested device.
    if isinstance(hubert, HubertModel):
        return hubert.to(device)

    # A checkpoint path: load it and take the first model of the ensemble.
    if isinstance(hubert, str):
        ensemble, _, _ = load_model_ensemble_and_task([hubert])
        return ensemble[0].to(device)

    # Default model: served from the cache when present.
    if "hubert" in auto_loaded_model:
        return auto_loaded_model["hubert"]

    checkpoint = hf_hub_download("lj1995/VoiceConversionWebUI", "hubert_base.pt")
    ensemble, _, _ = load_model_ensemble_and_task([checkpoint])
    auto_loaded_model["hubert"] = ensemble[0].to(device)
    return auto_loaded_model["hubert"]
zerorvc/preprocess/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .preprocess import Preprocessor
2
+ from .crop import crop_feats_length
zerorvc/preprocess/crop.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Tuple
2
+ import numpy as np
3
+
4
+
5
def crop_feats_length(
    spec: np.ndarray, phone: np.ndarray, pitch: np.ndarray, pitchf: np.ndarray
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Trim all four features to a common length.

    The length of ``phone`` is its first axis and the length of ``spec`` is
    its second axis. When they already agree, every input is returned
    untouched; otherwise all four arrays are cut to the shorter of the two
    (``pitch`` and ``pitchf`` along their first axis).
    """
    n_phone = phone.shape[0]
    n_spec = spec.shape[1]
    if n_phone == n_spec:
        return spec, phone, pitch, pitchf

    keep = min(n_phone, n_spec)
    return spec[:, :keep], phone[:keep, :], pitch[:keep], pitchf[:keep]
zerorvc/preprocess/preprocess.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import librosa
3
+ from scipy import signal
4
+ from .slicer2 import Slicer
5
+
6
+
7
class Preprocessor:
    """Filter a waveform, split it on silence and cut it into short slices.

    Each silence-delimited clip produced by the slicer is cut into windows
    of ``max_slice_length`` seconds that overlap by ``overlap`` seconds;
    the remaining tail is kept only when it is longer than
    ``min_slice_length`` seconds. Every emitted slice is peak-normalised
    by :meth:`norm`.
    """

    def __init__(
        self, sr: int, max_slice_length: float = 3.0, min_slice_length: float = 0.5
    ):
        """
        Args:
            sr: Sample rate of the audio that will be processed.
            max_slice_length: Length in seconds of a full slice.
            min_slice_length: Minimum length in seconds for the final,
                shorter slice of a clip to be kept.

        Raises:
            ValueError: If ``max_slice_length`` does not exceed the overlap,
                because the slicing window could then never advance.
        """
        self.slicer = Slicer(
            sr=sr,
            threshold=-42,
            min_length=1500,
            min_interval=400,
            hop_size=15,
            max_sil_kept=500,
        )
        self.sr = sr
        # 5th-order Butterworth high-pass with a 48 Hz cutoff.
        self.bh, self.ah = signal.butter(N=5, Wn=48, btype="high", fs=self.sr)
        self.overlap = 0.3
        if max_slice_length <= self.overlap:
            raise ValueError(
                f"max_slice_length must be greater than the overlap ({self.overlap}s)"
            )
        self.max_slice_length = max_slice_length
        # Previously this value was written to ``max_slice_length`` by
        # mistake, leaving ``min_slice_length`` undefined.
        self.min_slice_length = min_slice_length
        self.tail = self.max_slice_length + self.overlap
        self.max = 0.9
        self.alpha = 0.75

    def norm(self, samples: np.ndarray) -> np.ndarray:
        """Blend a peak-normalised copy of ``samples`` with the original.

        The result is ``alpha`` parts of the signal rescaled to a peak of
        ``self.max`` plus ``1 - alpha`` parts of the untouched signal. An
        all-zero input is returned as a copy, since rescaling it would
        divide by zero and yield NaNs.
        """
        peak = np.abs(samples).max()
        if peak == 0:
            return samples.copy()
        rescaled = samples / peak * self.max
        return (rescaled * self.alpha) + (samples * (1 - self.alpha))

    def preprocess_audio(self, y: np.ndarray) -> list[np.ndarray]:
        """High-pass filter ``y`` and return its normalised slices."""
        y = signal.filtfilt(self.bh, self.ah, y)
        window = int(self.max_slice_length * self.sr)
        audios = []
        for audio in self.slicer.slice(y):
            i = 0
            while True:
                start = int(self.sr * (self.max_slice_length - self.overlap) * i)
                i += 1
                if len(audio[start:]) > self.tail * self.sr:
                    # Enough audio left for a full window plus more.
                    audios.append(self.norm(audio[start : start + window]))
                    continue
                # Last piece of this clip: keep it only if long enough.
                chunk = audio[start:]
                if len(chunk) > self.min_slice_length * self.sr:
                    audios.append(self.norm(chunk))
                break
        return audios

    def preprocess_file(self, file_path: str) -> list[np.ndarray]:
        """Load ``file_path`` at ``self.sr`` and run :meth:`preprocess_audio`."""
        y, _ = librosa.load(file_path, sr=self.sr)
        return self.preprocess_audio(y)
zerorvc/preprocess/slicer2.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # From https://github.com/openvpi/audio-slicer
2
+ # MIT License: https://github.com/openvpi/audio-slicer/blob/main/LICENSE
3
+ from librosa.feature import rms as get_rms
4
+
5
+
6
class Slicer:
    """Split a waveform into chunks at sufficiently long silent stretches.

    All durations passed to the constructor are in milliseconds and are
    converted to frame counts (units of ``hop_size`` samples) internally.
    """

    def __init__(
        self,
        sr: int,
        threshold: float = -40.0,
        min_length: int = 5000,
        min_interval: int = 300,
        hop_size: int = 20,
        max_sil_kept: int = 5000,
    ):
        """
        Args:
            sr: Sample rate of the audio to slice.
            threshold: Silence threshold in dB; stored as a linear amplitude.
            min_length: Minimum clip length (ms) before a cut is allowed.
            min_interval: Minimum silence length (ms) that triggers a cut.
            hop_size: RMS frame hop (ms).
            max_sil_kept: Maximum silence (ms) kept around a cut.

        Raises:
            ValueError: If the length arguments are not ordered as required.
        """
        if not min_length >= min_interval >= hop_size:
            raise ValueError(
                "The following condition must be satisfied: min_length >= min_interval >= hop_size"
            )
        if not max_sil_kept >= hop_size:
            raise ValueError(
                "The following condition must be satisfied: max_sil_kept >= hop_size"
            )
        interval_samples = sr * min_interval / 1000
        self.threshold = 10 ** (threshold / 20.0)
        self.hop_size = round(sr * hop_size / 1000)
        self.win_size = min(round(interval_samples), 4 * self.hop_size)
        self.min_length = round(sr * min_length / 1000 / self.hop_size)
        self.min_interval = round(interval_samples / self.hop_size)
        self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)

    def _apply_slice(self, waveform, begin, end):
        """Return the samples between frames ``begin`` and ``end``.

        Frame indices are scaled by ``hop_size``; the end is clamped to the
        waveform length. Multi-dimensional input is sliced on its second axis.
        """
        multichannel = len(waveform.shape) > 1
        first = begin * self.hop_size
        last = min(waveform.shape[1 if multichannel else 0], end * self.hop_size)
        if multichannel:
            return waveform[:, first:last]
        return waveform[first:last]

    # @timeit
    def slice(self, waveform):
        """Cut ``waveform`` at silences and return the list of chunks.

        Returns ``[waveform]`` unchanged when the input is short or when no
        cut point is found.
        """
        mono = waveform.mean(axis=0) if len(waveform.shape) > 1 else waveform
        # NOTE(review): this compares a sample count with ``min_length``,
        # which is stored in frames -- confirm this is intended.
        if mono.shape[0] <= self.min_length:
            return [waveform]
        rms_list = get_rms(
            y=mono, frame_length=self.win_size, hop_length=self.hop_size
        ).squeeze(0)

        def quietest(lo, hi):
            # Absolute index of the lowest-RMS frame within rms_list[lo:hi].
            return rms_list[lo:hi].argmin() + lo

        keep = self.max_sil_kept
        sil_tags = []
        sil_from = None
        clip_from = 0
        for frame, level in enumerate(rms_list):
            if level < self.threshold:
                # Silent frame: remember where the silent run began.
                if sil_from is None:
                    sil_from = frame
                continue
            # Voiced frame with no pending silence: nothing to decide.
            if sil_from is None:
                continue
            sil_len = frame - sil_from
            leading = sil_from == 0 and frame > keep
            middle = (
                sil_len >= self.min_interval
                and frame - clip_from >= self.min_length
            )
            if not (leading or middle):
                # Silence too short, or current clip too short: forget it.
                sil_from = None
                continue
            # A cut is needed; record the range of silent frames to drop.
            if sil_len <= keep:
                pos = quietest(sil_from, frame + 1)
                sil_tags.append((0, pos) if sil_from == 0 else (pos, pos))
                clip_from = pos
            elif sil_len <= keep * 2:
                pos = quietest(frame - keep, sil_from + keep + 1)
                pos_l = quietest(sil_from, sil_from + keep + 1)
                pos_r = quietest(frame - keep, frame + 1)
                if sil_from == 0:
                    sil_tags.append((0, pos_r))
                    clip_from = pos_r
                else:
                    sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
                    clip_from = max(pos_r, pos)
            else:
                pos_l = quietest(sil_from, sil_from + keep + 1)
                pos_r = quietest(frame - keep, frame + 1)
                sil_tags.append((0, pos_r) if sil_from == 0 else (pos_l, pos_r))
                clip_from = pos_r
            sil_from = None

        # Deal with trailing silence.
        total_frames = rms_list.shape[0]
        if sil_from is not None and total_frames - sil_from >= self.min_interval:
            sil_to = min(total_frames, sil_from + keep)
            sil_tags.append((quietest(sil_from, sil_to + 1), total_frames + 1))

        # Apply and return slices.
        if len(sil_tags) == 0:
            return [waveform]
        chunks = []
        if sil_tags[0][0] > 0:
            chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0]))
        for (_, prev_end), (next_begin, _) in zip(sil_tags, sil_tags[1:]):
            chunks.append(self._apply_slice(waveform, prev_end, next_begin))
        if sil_tags[-1][1] < total_frames:
            chunks.append(self._apply_slice(waveform, sil_tags[-1][1], total_frames))
        return chunks
zerorvc/pretrained.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Tuple
2
+ from huggingface_hub import hf_hub_download
3
+
4
+
5
def pretrained_checkpoints() -> Tuple[str, str]:
    """
    Fetch the pretrained checkpoints from the Hugging Face Hub.

    Returns:
        A tuple ``(G, D)`` with the local paths of the downloaded generator
        and discriminator checkpoints, in that order.
    """
    repo_id = "lj1995/VoiceConversionWebUI"
    generator_path = hf_hub_download(repo_id, "pretrained_v2/f0G48k.pth")
    discriminator_path = hf_hub_download(repo_id, "pretrained_v2/f0D48k.pth")
    return generator_path, discriminator_path
zerorvc/rvc.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from logging import getLogger
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn.functional as F
6
+ import librosa
7
+ from accelerate import Accelerator
8
+ from datasets import Dataset
9
+
10
+ from .f0 import F0Extractor, RMVPE, load_rmvpe
11
+ from .hubert import HubertFeatureExtractor, HubertModel, load_hubert
12
+ from .synthesizer import SynthesizerTrnMs768NSFsid
13
+ from .constants import *
14
+
15
+ logger = getLogger(__name__)
16
+
17
+
18
class RVC:
    """
    RVC (Retrieval-based Voice Conversion) class for converting speech using a pre-trained model.

    Args:
        name (str | SynthesizerTrnMs768NSFsid): The name of the pre-trained model or the model instance itself.
        sr (int, optional): The sample rate of the generated audio. Defaults to SR_48K.
        segment_size (float, optional): The segment size for splitting the input audio. Defaults to 30.0 seconds.
        hubert (str | HubertModel | None, optional): The name of the pre-trained Hubert model or the model instance itself. Defaults to None.
        rmvpe (str | RMVPE | None, optional): The name of the pre-trained RMVPE model or the model instance itself. Defaults to None.
        accelerator (Accelerator, optional): The accelerator device for model inference. Defaults to Accelerator().
        from_pretrained_kwargs (dict | None, optional): Additional keyword arguments for loading the pre-trained model. Defaults to None (no extra arguments).

    Methods:
        from_pretrained(name, sr=SR_48K, segment_size=30.0, hubert=None, rmvpe=None, accelerator=Accelerator(), **from_pretrained_kwargs):
            Creates an instance of RVC using the from_pretrained method.

        convert(audio, protect=0.33, pitch_modification=0.0):
            Converts the input audio to the target voice using the pre-trained model.

        convert_dataset(dataset, protect=0.33, pitch_modification=0.0):
            Converts a dataset of audio samples to the target voice using the pre-trained model.

        convert_file(audio, protect=0.33, pitch_modification=0.0):
            Converts a single audio file to the target voice using the pre-trained model.

        convert_from_wav16k(wav16k, protect=0.33, pitch_modification=0.0):
            Converts a 16kHz waveform to the target voice using the pre-trained model.

        convert_from_features(phone, pitchf, pitch, protect=0.33, pitch_modification=0.0):
            Converts audio features (phone, pitchf, pitch) to the target voice using the pre-trained model.
    """

    def __init__(
        self,
        name: str | SynthesizerTrnMs768NSFsid,
        sr=SR_48K,
        segment_size=30.0,
        hubert: str | HubertModel | None = None,
        rmvpe: str | RMVPE | None = None,
        accelerator: Accelerator = Accelerator(),
        from_pretrained_kwargs: dict | None = None,
    ):
        """
        Initializes an instance of the RVC class.

        Args:
            name (str | SynthesizerTrnMs768NSFsid): The name of the pre-trained model or the model instance itself.
            sr (int, optional): The sample rate of the generated audio. Defaults to SR_48K.
            segment_size (float, optional): The segment size for splitting the input audio. Defaults to 30.0 seconds.
            hubert (str | HubertModel | None, optional): The name of the pre-trained Hubert model or the model instance itself. Defaults to None.
            rmvpe (str | RMVPE | None, optional): The name of the pre-trained RMVPE model or the model instance itself. Defaults to None.
            accelerator (Accelerator, optional): The accelerator device for model inference. Defaults to Accelerator().
            from_pretrained_kwargs (dict | None, optional): Additional keyword arguments for loading the pre-trained model. Defaults to None.
        """
        # NOTE: the default Accelerator() is evaluated once, when this
        # function is defined, so all default-constructed RVCs share it.
        if isinstance(name, str):
            # ``None`` stands in for "no extra arguments"; a literal ``{}``
            # default would be one dict shared by every call.
            extra_kwargs = from_pretrained_kwargs or {}
            model = SynthesizerTrnMs768NSFsid.from_pretrained(name, **extra_kwargs)
        else:
            model = name
        self.model = model.to(accelerator.device)
        self.sr = sr
        self.segment_size = segment_size
        self.hubert = HubertFeatureExtractor(load_hubert(hubert, accelerator.device))
        self.rmvpe = F0Extractor(load_rmvpe(rmvpe, accelerator.device))
        self.accelerator = accelerator

    @staticmethod
    def from_pretrained(
        name: str,
        sr=SR_48K,
        segment_size=30.0,
        hubert: str | HubertModel | None = None,
        rmvpe: str | RMVPE | None = None,
        accelerator: Accelerator = Accelerator(),
        **from_pretrained_kwargs,
    ):
        """
        Creates an instance of RVC using the from_pretrained method.

        Args:
            name (str): The name of the pre-trained model.
            sr (int, optional): The sample rate of the generated audio. Defaults to SR_48K.
            segment_size (float, optional): The segment size for splitting the input audio. Defaults to 30.0 seconds.
            hubert (str | HubertModel | None, optional): The name of the pre-trained Hubert model or the model instance itself. Defaults to None.
            rmvpe (str | RMVPE | None, optional): The name of the pre-trained RMVPE model or the model instance itself. Defaults to None.
            accelerator (Accelerator, optional): The accelerator device for model inference. Defaults to Accelerator().
            from_pretrained_kwargs (dict): Additional keyword arguments for loading the pre-trained model.

        Returns:
            RVC: An instance of the RVC class.
        """
        return RVC(
            name, sr, segment_size, hubert, rmvpe, accelerator, from_pretrained_kwargs
        )

    def convert(
        self, audio: str | Dataset | np.ndarray, protect=0.33, pitch_modification=0.0
    ):
        """
        Converts the input audio to the target voice using the pre-trained model.

        Args:
            audio (str | Dataset | np.ndarray): The input audio to be converted. It can be a file path, a dataset of audio samples, or a numpy array (treated as a 16kHz waveform).
            protect (float, optional): The protection factor for preserving the original voice. Defaults to 0.33.
            pitch_modification (float, optional): The pitch shift in semitones. Defaults to 0.0.

        Returns:
            np.ndarray: The converted audio in the target voice.
            If the input is a dataset, a generator yielding the converted audio samples one by one.
        """
        logger.info(
            f"audio: {audio}, protect: {protect}, pitch_modification: {pitch_modification}"
        )
        if isinstance(audio, str):
            return self.convert_file(audio, protect, pitch_modification)
        if isinstance(audio, Dataset):
            return self.convert_dataset(audio, protect, pitch_modification)
        return self.convert_from_wav16k(audio, protect, pitch_modification)

    def convert_dataset(self, dataset: Dataset, protect=0.33, pitch_modification=0.0):
        """
        Converts a dataset of audio samples to the target voice using the pre-trained model.

        Each row must provide the keys ``hubert_feats``, ``f0nsf`` and ``f0``.

        Args:
            dataset (Dataset): The dataset of audio samples to be converted.
            protect (float, optional): The protection factor for preserving the original voice. Defaults to 0.33.
            pitch_modification (float, optional): The pitch shift in semitones. Defaults to 0.0.

        Yields:
            np.ndarray: The converted audio samples in the target voice.
        """
        for i, data in enumerate(dataset):
            logger.info(f"Converting data {i}")
            phone = data["hubert_feats"]
            pitchf = data["f0nsf"]
            pitch = data["f0"]
            yield self.convert_from_features(
                phone, pitchf, pitch, protect, pitch_modification
            )

    def convert_file(
        self, audio: str, protect=0.33, pitch_modification=0.0
    ) -> np.ndarray:
        """
        Converts a single audio file to the target voice using the pre-trained model.

        Args:
            audio (str): The path to the audio file to be converted.
            protect (float, optional): The protection factor for preserving the original voice. Defaults to 0.33.
            pitch_modification (float, optional): The pitch shift in semitones. Defaults to 0.0.

        Returns:
            np.ndarray: The converted audio in the target voice.
        """
        wav16k, _ = librosa.load(audio, sr=SR_16K)
        logger.info(f"Loaded {audio} with shape {wav16k.shape}")
        return self.convert_from_wav16k(wav16k, protect, pitch_modification)

    def convert_from_wav16k(
        self, wav16k: np.ndarray, protect=0.33, pitch_modification=0.0
    ) -> np.ndarray:
        """
        Converts a 16kHz waveform to the target voice using the pre-trained model.

        The waveform is processed in segments of ``segment_size`` seconds.
        Each segment is reflect-padded by ``SR_16K`` samples on both sides
        before conversion, and ``self.sr`` samples are trimmed from both
        ends of the converted segment before the pieces are concatenated.

        Args:
            wav16k (np.ndarray): The 16kHz waveform to be converted.
            protect (float, optional): The protection factor for preserving the original voice. Defaults to 0.33.
            pitch_modification (float, optional): The pitch shift in semitones. Defaults to 0.0.

        Returns:
            np.ndarray: The converted audio in the target voice. An empty
            input yields an empty float32 array.
        """
        if len(wav16k) == 0:
            # Nothing to convert; np.concatenate([]) below would raise.
            return np.zeros(0, dtype=np.float32)

        ret = []
        segment_size = int(self.segment_size * SR_16K)
        for i in range(0, len(wav16k), segment_size):
            segment = wav16k[i : i + segment_size]
            segment = np.pad(segment, (SR_16K, SR_16K), mode="reflect")
            logger.info(f"Padded audio with shape {segment.shape}")

            pitchf, pitch = self.rmvpe.extract_f0_from(segment)
            phone = self.hubert.extract_feature_from(segment)

            ret.append(
                self.convert_from_features(
                    phone, pitchf, pitch, protect, pitch_modification
                )[self.sr : -self.sr]
            )

        return np.concatenate(ret)

    def convert_from_features(
        self,
        phone: np.ndarray,
        pitchf: np.ndarray,
        pitch: np.ndarray,
        protect=0.33,
        pitch_modification=0.0,
    ) -> np.ndarray:
        """
        Converts audio features (phone, pitchf, pitch) to the target voice using the pre-trained model.

        The inputs are never modified; pitch shifting works on a copy.

        Args:
            phone (np.ndarray): The phone features of the audio.
            pitchf (np.ndarray): The pitch features of the audio.
            pitch (np.ndarray): The pitch values of the audio.
            protect (float, optional): The protection factor for preserving the original voice; only applied when below 0.5. Defaults to 0.33.
            pitch_modification (float, optional): The pitch shift in semitones. Defaults to 0.0.

        Returns:
            np.ndarray: The converted audio in the target voice.
        """
        use_protect = protect < 0.5

        if pitch_modification != 0.0:
            # Scale a new array instead of using ``*=``: the in-place form
            # changed the caller's data and failed for list inputs.
            pitchf = np.asarray(pitchf) * pow(2, pitch_modification / 12)
            pitch = self.rmvpe.calculate_f0_from_f0nsf(pitchf)

        # Add the batch dimension expected by the model.
        pitchf = np.expand_dims(pitchf, axis=0)
        pitch = np.expand_dims(pitch, axis=0)
        phone = np.expand_dims(phone, axis=0)

        self.model.eval()
        with torch.no_grad(), self.accelerator.device:
            pitchf = torch.from_numpy(pitchf).to(
                dtype=torch.float32, device=self.accelerator.device
            )
            pitch = torch.from_numpy(pitch).to(
                dtype=torch.long, device=self.accelerator.device
            )
            phone = torch.from_numpy(phone).to(
                dtype=torch.float32, device=self.accelerator.device
            )

            if use_protect:
                feats0 = phone.clone()

            # Double the number of frames along the time axis.
            feats: torch.Tensor = F.interpolate(
                phone.permute(0, 2, 1), scale_factor=2
            ).permute(0, 2, 1)
            if use_protect:
                feats0: torch.Tensor = F.interpolate(
                    feats0.permute(0, 2, 1), scale_factor=2
                ).permute(0, 2, 1)

            # It's originally like this, but I think it's ok to assume that feats.shape[1] <= phone_len
            # maybe we should use the same crop function from preprocessor
            # phone_len = wav16k.shape[0] // 160
            # if feats.shape[1] < phone_len:
            #     ...
            phone_len = feats.shape[1]
            pitch = pitch[:, :phone_len]
            pitchf = pitchf[:, :phone_len]

            if use_protect:
                # Per-frame blend weight: 1 where pitchf >= 1, ``protect``
                # where pitchf < 1.
                pitchff = pitchf.clone()
                pitchff[pitchf > 0] = 1
                pitchff[pitchf < 1] = protect
                pitchff = pitchff.unsqueeze(-1)
                feats = feats * pitchff + feats0 * (1 - pitchff)
                feats = feats.to(feats0.dtype)

            phone_len = torch.tensor([phone_len], dtype=torch.long)
            sid = torch.tensor([0], dtype=torch.long)

            logger.info(f"Feats shape: {feats.shape}")
            logger.info(f"Phone len: {phone_len}")
            logger.info(f"Pitch shape: {pitch.shape}")
            logger.info(f"Pitchf shape: {pitchf.shape}")
            logger.info(f"SID shape: {sid}")
            audio_segment = (
                self.model.infer(feats, phone_len, pitch, pitchf, sid)[0][0, 0]
                .data.cpu()
                .float()
                .numpy()
            )
            logger.info(
                f"Generated audio shape: {audio_segment.shape} {audio_segment.dtype}"
            )
            return audio_segment
zerorvc/synthesizer/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .models import SynthesizerTrnMs768NSFsid, MultiPeriodDiscriminator
zerorvc/synthesizer/attentions.py ADDED
@@ -0,0 +1,461 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import Optional
3
+
4
+ import torch
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+
8
+ from . import commons
9
+ from .modules import LayerNorm
10
+
11
+
12
class Encoder(nn.Module):
    """Stack of ``n_layers`` self-attention / feed-forward layers.

    Each layer applies attention, dropout, a residual add and a layer norm,
    followed by the same sequence with a feed-forward block.
    """

    def __init__(
        self,
        hidden_channels: int,
        filter_channels: int,
        n_heads: int,
        n_layers: int,
        kernel_size=1,
        p_dropout=0.0,
        window_size=10,
    ):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = int(n_layers)
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.window_size = window_size

        self.drop = nn.Dropout(p_dropout)
        self.attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        for _ in range(self.n_layers):
            attention = MultiHeadAttention(
                hidden_channels,
                hidden_channels,
                n_heads,
                p_dropout=p_dropout,
                window_size=window_size,
            )
            self.attn_layers.append(attention)
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            feed_forward = FFN(
                hidden_channels,
                hidden_channels,
                filter_channels,
                kernel_size,
                p_dropout=p_dropout,
            )
            self.ffn_layers.append(feed_forward)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask):
        """Encode ``x``; positions where ``x_mask`` is zero are zeroed out.

        The attention mask is the outer product of ``x_mask`` with itself.
        """
        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        for idx in range(len(self.attn_layers)):
            # Self-attention sub-layer with residual connection.
            attended = self.drop(self.attn_layers[idx](x, x, attn_mask))
            x = self.norm_layers_1[idx](x + attended)

            # Feed-forward sub-layer with residual connection.
            transformed = self.drop(self.ffn_layers[idx](x, x_mask))
            x = self.norm_layers_2[idx](x + transformed)
        return x * x_mask
75
+
76
+
77
class Decoder(nn.Module):
    """Stack of decoder layers: self-attention, encoder-decoder attention, FFN.

    Every sub-layer is followed by dropout, a residual add and a layer norm.
    """

    def __init__(
        self,
        hidden_channels: int,
        filter_channels: int,
        n_heads: int,
        n_layers: int,
        kernel_size=1,
        p_dropout=0.0,
        proximal_bias=False,
        proximal_init=True,
    ):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init

        self.drop = nn.Dropout(p_dropout)
        self.self_attn_layers = nn.ModuleList()
        self.norm_layers_0 = nn.ModuleList()
        self.encdec_attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        for _ in range(self.n_layers):
            self_attention = MultiHeadAttention(
                hidden_channels,
                hidden_channels,
                n_heads,
                p_dropout=p_dropout,
                proximal_bias=proximal_bias,
                proximal_init=proximal_init,
            )
            self.self_attn_layers.append(self_attention)
            self.norm_layers_0.append(LayerNorm(hidden_channels))
            cross_attention = MultiHeadAttention(
                hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
            )
            self.encdec_attn_layers.append(cross_attention)
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            feed_forward = FFN(
                hidden_channels,
                hidden_channels,
                filter_channels,
                kernel_size,
                p_dropout=p_dropout,
                causal=True,
            )
            self.ffn_layers.append(feed_forward)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        h: torch.Tensor,
        h_mask: torch.Tensor,
    ):
        """
        x: decoder input
        h: encoder output
        """
        self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(
            device=x.device, dtype=x.dtype
        )
        encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        layers = zip(
            self.self_attn_layers,
            self.norm_layers_0,
            self.encdec_attn_layers,
            self.norm_layers_1,
            self.ffn_layers,
            self.norm_layers_2,
        )
        for self_attn, norm_0, cross_attn, norm_1, ffn, norm_2 in layers:
            # Self-attention restricted by the subsequent-position mask.
            y = self.drop(self_attn(x, x, self_attn_mask))
            x = norm_0(x + y)

            # Attention over the encoder output.
            y = self.drop(cross_attn(x, h, encdec_attn_mask))
            x = norm_1(x + y)

            # Feed-forward block.
            y = self.drop(ffn(x, x_mask))
            x = norm_2(x + y)
        return x * x_mask
166
+
167
+
168
+ class MultiHeadAttention(nn.Module):
169
+ def __init__(
170
+ self,
171
+ channels: int,
172
+ out_channels: int,
173
+ n_heads: int,
174
+ p_dropout=0.0,
175
+ window_size: int = None,
176
+ heads_share=True,
177
+ block_length: int = None,
178
+ proximal_bias=False,
179
+ proximal_init=False,
180
+ ):
181
+ super().__init__()
182
+ assert channels % n_heads == 0
183
+
184
+ self.channels = channels
185
+ self.out_channels = out_channels
186
+ self.n_heads = n_heads
187
+ self.p_dropout = p_dropout
188
+ self.window_size = window_size
189
+ self.heads_share = heads_share
190
+ self.block_length = block_length
191
+ self.proximal_bias = proximal_bias
192
+ self.proximal_init = proximal_init
193
+ self.attn = None
194
+
195
+ self.k_channels = channels // n_heads
196
+ self.conv_q = nn.Conv1d(channels, channels, 1)
197
+ self.conv_k = nn.Conv1d(channels, channels, 1)
198
+ self.conv_v = nn.Conv1d(channels, channels, 1)
199
+ self.conv_o = nn.Conv1d(channels, out_channels, 1)
200
+ self.drop = nn.Dropout(p_dropout)
201
+
202
+ if window_size is not None:
203
+ n_heads_rel = 1 if heads_share else n_heads
204
+ rel_stddev = self.k_channels**-0.5
205
+ self.emb_rel_k = nn.Parameter(
206
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
207
+ * rel_stddev
208
+ )
209
+ self.emb_rel_v = nn.Parameter(
210
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
211
+ * rel_stddev
212
+ )
213
+
214
+ nn.init.xavier_uniform_(self.conv_q.weight)
215
+ nn.init.xavier_uniform_(self.conv_k.weight)
216
+ nn.init.xavier_uniform_(self.conv_v.weight)
217
+ if proximal_init:
218
+ with torch.no_grad():
219
+ self.conv_k.weight.copy_(self.conv_q.weight)
220
+ self.conv_k.bias.copy_(self.conv_q.bias)
221
+
222
+ def forward(
223
+ self, x: torch.Tensor, c: torch.Tensor, attn_mask: Optional[torch.Tensor] = None
224
+ ):
225
+ q = self.conv_q(x)
226
+ k = self.conv_k(c)
227
+ v = self.conv_v(c)
228
+
229
+ x, _ = self.attention(q, k, v, mask=attn_mask)
230
+
231
+ x = self.conv_o(x)
232
+ return x
233
+
234
def attention(
    self,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    mask: Optional[torch.Tensor] = None,
):
    """Run scaled dot-product attention over ``self.n_heads`` heads.

    Args:
        query: projected queries, ``[b, d, t_t]``.
        key: projected keys, ``[b, d, t_s]``.
        value: projected values, ``[b, d, t_s]``.
        mask: optional mask; positions where ``mask == 0`` get a score
            of ``-1e4`` before the softmax.

    Returns:
        ``(output, p_attn)`` with ``output`` of shape ``[b, d, t_t]`` and
        ``p_attn`` (post-dropout attention weights) of shape
        ``[b, n_h, t_t, t_s]``.
    """
    b, d, t_s = key.size()
    t_t = query.size(2)

    # Split the channel axis into heads: [b, d, t] -> [b, n_h, t, d_k].
    q_heads = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
    k_heads = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
    v_heads = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)

    # The scaled query is shared by the content and relative-position terms.
    q_scaled = q_heads / math.sqrt(self.k_channels)
    scores = torch.matmul(q_scaled, k_heads.transpose(-2, -1))

    if self.window_size is not None:
        assert (
            t_s == t_t
        ), "Relative attention is only available for self-attention."
        rel_keys = self._get_relative_embeddings(self.emb_rel_k, t_s)
        rel_logits = self._matmul_with_relative_keys(q_scaled, rel_keys)
        scores = scores + self._relative_position_to_absolute_position(rel_logits)

    if self.proximal_bias:
        assert t_s == t_t, "Proximal bias is only available for self-attention."
        proximal = self._attention_bias_proximal(t_s)
        scores = scores + proximal.to(device=scores.device, dtype=scores.dtype)

    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e4)
        if self.block_length is not None:
            assert (
                t_s == t_t
            ), "Local attention is only available for self-attention."
            # Keep only a band of +/- block_length around the diagonal.
            band = torch.ones_like(scores)
            band = band.triu(-self.block_length).tril(self.block_length)
            scores = scores.masked_fill(band == 0, -1e4)

    p_attn = self.drop(F.softmax(scores, dim=-1))  # [b, n_h, t_t, t_s]
    output = torch.matmul(p_attn, v_heads)

    if self.window_size is not None:
        rel_weights = self._absolute_position_to_relative_position(p_attn)
        rel_values = self._get_relative_embeddings(self.emb_rel_v, t_s)
        output = output + self._matmul_with_relative_values(rel_weights, rel_values)

    # Merge heads back: [b, n_h, t_t, d_k] -> [b, d, t_t].
    output = output.transpose(2, 3).contiguous().view(b, d, t_t)
    return output, p_attn
291
+
292
def _matmul_with_relative_values(self, x: torch.Tensor, y: torch.Tensor):
    """Multiply relative attention weights by relative value embeddings.

    x: [b, h, l, m]
    y: [h or 1, m, d]
    ret: [b, h, l, d]
    """
    # A leading batch axis on ``y`` lets matmul broadcast over ``b``.
    batched_values = torch.unsqueeze(y, 0)
    return torch.matmul(x, batched_values)
300
+
301
def _matmul_with_relative_keys(self, x: torch.Tensor, y: torch.Tensor):
    """Compute logits of queries against relative key embeddings.

    x: [b, h, l, d]
    y: [h or 1, m, d]
    ret: [b, h, l, m]
    """
    # Add a batch axis, then swap the last two axes so ``d`` contracts.
    keys_t = torch.unsqueeze(y, 0).transpose(-2, -1)
    return torch.matmul(x, keys_t)
309
+
310
def _get_relative_embeddings(self, relative_embeddings: torch.Tensor, length: int):
    """Return the ``2 * length - 1`` relative embeddings used for ``length`` steps.

    The table is zero-padded along dim 1 when ``length`` exceeds
    ``self.window_size + 1`` and sliced when it is shorter, so the result
    always spans ``2 * length - 1`` positions along dim 1.
    """
    half_window: int = self.window_size + 1
    pad_length: int = max(length - half_window, 0)
    start: int = max(half_window - length, 0)
    stop: int = start + 2 * length - 1

    padded = relative_embeddings
    if pad_length > 0:
        # F.pad lists pads from the last dim backwards: only dim 1 is padded.
        padded = F.pad(relative_embeddings, [0, 0, pad_length, pad_length, 0, 0])
    return padded[:, start:stop]
328
+
329
def _relative_position_to_absolute_position(self, x: torch.Tensor):
    """Re-index relative-position logits into absolute positions.

    x: [b, h, l, 2*l-1]
    ret: [b, h, l, l]
    """
    batch, heads, length, _ = x.size()

    # One extra zero column per row: [b, h, l, 2*l].
    widened = F.pad(x, [0, 1, 0, 0, 0, 0, 0, 0])

    # Flatten and append l-1 zeros so the buffer reshapes to (l+1, 2*l-1);
    # the changed row width is what skews relative into absolute indices.
    flat = widened.view([batch, heads, length * 2 * length])
    flat = F.pad(flat, [0, int(length) - 1, 0, 0, 0, 0])

    skewed = flat.view([batch, heads, length + 1, 2 * length - 1])
    # Drop the padded row and the leading columns.
    return skewed[:, :, :length, length - 1 :]
355
+
356
def _absolute_position_to_relative_position(self, x: torch.Tensor):
    """Re-index absolute-position weights into relative positions.

    x: [b, h, l, l]
    ret: [b, h, l, 2*l-1]
    """
    batch, heads, length, _ = x.size()

    # Pad l-1 zero columns on the right: [b, h, l, 2*l-1].
    widened = F.pad(x, [0, int(length) - 1, 0, 0, 0, 0, 0, 0])
    flat = widened.view([batch, heads, length * (2 * length - 1)])

    # Leading zeros shift every row after the reshape to rows of width 2*l.
    flat = F.pad(flat, [length, 0, 0, 0, 0, 0])
    skewed = flat.view([batch, heads, length, 2 * length])
    # The first column is padding only.
    return skewed[:, :, :, 1:]
377
+
378
def _attention_bias_proximal(self, length: int):
    """Build a bias that favours attention to nearby positions.

    Args:
        length: an integer scalar.
    Returns:
        a float32 Tensor with shape [1, 1, length, length] whose entry
        (i, j) is ``-log(1 + |i - j|)``.
    """
    positions = torch.arange(length, dtype=torch.float32)
    offsets = positions.unsqueeze(0) - positions.unsqueeze(1)
    bias = -torch.log1p(offsets.abs())
    return bias.unsqueeze(0).unsqueeze(0)
388
+
389
+
390
class FFN(nn.Module):
    """Two-layer 1-D convolutional feed-forward block with masking.

    The input is masked and padded before each convolution so the time
    length is preserved. Padding is left-only when ``causal`` is true and
    split across both sides otherwise.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        filter_channels: int,
        kernel_size: int,
        p_dropout=0.0,
        activation: Optional[str] = None,
        causal=False,
    ):
        """Create the block.

        Args:
            in_channels: channels of the input tensor.
            out_channels: channels of the output tensor.
            filter_channels: channels of the hidden layer.
            kernel_size: kernel size of both convolutions.
            p_dropout: dropout probability applied after the activation.
            activation: ``"gelu"`` selects the sigmoid-based GELU
                approximation; any other value (including ``None``)
                selects ReLU.
            causal: pad on the left only instead of on both sides.
        """
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.activation = activation
        self.causal = causal
        self.is_activation = activation == "gelu"

        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = nn.Dropout(p_dropout)

    def padding(self, x: torch.Tensor, x_mask: torch.Tensor) -> torch.Tensor:
        """Mask ``x`` and pad its last axis according to ``self.causal``."""
        if self.causal:
            padding = self._causal_padding(x * x_mask)
        else:
            padding = self._same_padding(x * x_mask)
        return padding

    def forward(self, x: torch.Tensor, x_mask: torch.Tensor):
        """Apply conv -> activation -> dropout -> conv and mask the result."""
        x = self.conv_1(self.padding(x, x_mask))
        if self.is_activation:
            # Sigmoid approximation of GELU: x * sigmoid(1.702 * x).
            x = x * torch.sigmoid(1.702 * x)
        else:
            x = torch.relu(x)
        x = self.drop(x)

        x = self.conv_2(self.padding(x, x_mask))
        return x * x_mask

    def _causal_padding(self, x: torch.Tensor):
        """Left-pad the last axis by ``kernel_size - 1`` zeros."""
        if self.kernel_size == 1:
            return x
        pad_l: int = self.kernel_size - 1
        pad_r: int = 0
        # F.pad lists pads from the last dim backwards: only time is padded.
        x = F.pad(x, [pad_l, pad_r, 0, 0, 0, 0])
        return x

    def _same_padding(self, x: torch.Tensor):
        """Pad the last axis on both sides by ``kernel_size - 1`` zeros in total."""
        if self.kernel_size == 1:
            return x
        pad_l: int = (self.kernel_size - 1) // 2
        pad_r: int = self.kernel_size // 2
        # F.pad lists pads from the last dim backwards: only time is padded.
        x = F.pad(x, [pad_l, pad_r, 0, 0, 0, 0])
        return x
zerorvc/synthesizer/commons.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional
2
+ import math
3
+
4
+ import torch
5
+ from torch.nn import functional as F
6
+
7
+
8
def init_weights(m, mean=0.0, std=0.01):
    """Re-draw ``m.weight`` from N(mean, std) when the class name contains "Conv".

    Intended for use with ``Module.apply``; modules whose class name does
    not contain ``"Conv"`` are left untouched.
    """
    # NOTE(review): if ``weight`` is a recomputed/parametrized attribute,
    # this in-place draw may not persist -- confirm against callers.
    if "Conv" in m.__class__.__name__:
        m.weight.data.normal_(mean, std)
12
+
13
+
14
def get_padding(kernel_size: int, dilation=1):
    """Return ``int((kernel_size * dilation - dilation) / 2)``.

    For odd ``kernel_size`` this is the padding that keeps the length of
    a stride-1 dilated convolution unchanged.
    """
    span = kernel_size * dilation - dilation
    return int(span / 2)
16
+
17
+
18
+ # def convert_pad_shape(pad_shape):
19
+ # l = pad_shape[::-1]
20
+ # pad_shape = [item for sublist in l for item in sublist]
21
+ # return pad_shape
22
+
23
+
24
def kl_divergence(
    m_p: torch.Tensor, logs_p: torch.Tensor, m_q: torch.Tensor, logs_q: torch.Tensor
):
    """Element-wise KL(P||Q) between diagonal Gaussians.

    ``m_*`` are the means and ``logs_*`` the log standard deviations.
    """
    var_p = torch.exp(2.0 * logs_p)
    inv_var_q = torch.exp(-2.0 * logs_q)
    sq_mean_gap = (m_p - m_q) ** 2

    kl = (logs_q - logs_p) - 0.5
    kl += 0.5 * (var_p + sq_mean_gap) * inv_var_q
    return kl
33
+
34
+
35
def rand_gumbel(shape):
    """Sample from the Gumbel distribution, protect from overflows."""
    # Squeeze U(0, 1) into [1e-5, 0.99999) so neither log sees 0 or 1.
    u = torch.rand(shape) * 0.99998 + 0.00001
    inner = torch.log(u)
    return -torch.log(-inner)
39
+
40
+
41
def rand_gumbel_like(x: torch.Tensor):
    """Sample Gumbel noise with the shape, dtype and device of ``x``."""
    noise = rand_gumbel(x.size())
    return noise.to(dtype=x.dtype, device=x.device)
44
+
45
+
46
def slice_segments(x: torch.Tensor, ids_str, segment_size=4):
    """Cut a ``segment_size`` window from each batch item along the last axis.

    ``ids_str[i]`` is the start index of the window taken from ``x[i]``.
    The result has the shape of ``x[:, :, :segment_size]``.
    """
    out = torch.zeros_like(x[:, :, :segment_size])
    for row in range(x.size(0)):
        start = ids_str[row]
        out[row] = x[row, :, start : start + segment_size]
    return out
53
+
54
+
55
def slice_segments2(x: torch.Tensor, ids_str, segment_size=4):
    """Cut a ``segment_size`` window from each row of ``x`` along axis 1.

    ``ids_str[i]`` is the start index of the window taken from ``x[i]``.
    The result has the shape of ``x[:, :segment_size]``.
    """
    out = torch.zeros_like(x[:, :segment_size])
    for row in range(x.size(0)):
        start = ids_str[row]
        out[row] = x[row, start : start + segment_size]
    return out
62
+
63
+
64
def rand_slice_segments(x: torch.Tensor, x_lengths=None, segment_size=4):
    """Cut one randomly positioned window per batch item from ``x``.

    Start indices are drawn uniformly from
    ``[0, x_lengths - segment_size + 1)``; ``x_lengths`` defaults to the
    size of the last axis of ``x``.

    Returns:
        ``(segments, ids_str)``: the sliced tensor and the start indices.
    """
    batch, _, frames = x.size()
    if x_lengths is None:
        x_lengths = frames
    start_limit = x_lengths - segment_size + 1
    uniform = torch.rand([batch]).to(device=x.device)
    ids_str = (uniform * start_limit).to(dtype=torch.long)
    return slice_segments(x, ids_str, segment_size), ids_str
72
+
73
+
74
def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
    """Build a sinusoidal timing signal of shape ``[1, channels, length]``.

    The first ``channels // 2`` channels are sines and the next
    ``channels // 2`` are cosines, over geometrically spaced timescales
    between ``min_timescale`` and ``max_timescale``. When ``channels`` is
    odd, one zero channel is appended.

    Args:
        length: number of time steps.
        channels: number of output channels.
        min_timescale: smallest timescale.
        max_timescale: largest timescale.
    """
    position = torch.arange(length, dtype=torch.float)
    num_timescales = channels // 2
    # Clamp the divisor: with a single timescale (channels of 2 or 3)
    # ``num_timescales - 1`` is 0 and the division would raise.
    log_timescale_increment = math.log(
        float(max_timescale) / float(min_timescale)
    ) / max(num_timescales - 1, 1)
    inv_timescales = min_timescale * torch.exp(
        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
    )
    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
    # Append a zero channel when ``channels`` is odd.
    signal = F.pad(signal, [0, 0, 0, channels % 2])
    return signal.view(1, channels, length)
88
+
89
+
90
def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
    """Add a sinusoidal timing signal to ``x`` of shape ``[b, channels, length]``."""
    _, channels, length = x.size()
    timing = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    timing = timing.to(dtype=x.dtype, device=x.device)
    return x + timing
94
+
95
+
96
def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
    """Concatenate a sinusoidal timing signal to ``x`` along ``axis``.

    ``x`` has shape ``[b, channels, length]``; the signal has shape
    ``[1, channels, length]``.
    """
    _, channels, length = x.size()
    timing = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    timing = timing.to(dtype=x.dtype, device=x.device)
    return torch.cat([x, timing], axis)
100
+
101
+
102
def subsequent_mask(length):
    """Return a lower-triangular mask of ones with shape ``[1, 1, length, length]``."""
    lower = torch.ones(length, length).tril()
    return lower.unsqueeze(0).unsqueeze(0)
105
+
106
+
107
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    # Gated activation: tanh over the first n_channels[0] channels of
    # (input_a + input_b) times sigmoid over the remaining channels.
    split = n_channels[0]
    summed = input_a + input_b
    tanh_part = torch.tanh(summed[:, :split, :])
    gate_part = torch.sigmoid(summed[:, split:, :])
    return tanh_part * gate_part
115
+
116
+
117
+ # def convert_pad_shape(pad_shape):
118
+ # l = pad_shape[::-1]
119
+ # pad_shape = [item for sublist in l for item in sublist]
120
+ # return pad_shape
121
+
122
+
123
def convert_pad_shape(pad_shape: List[List[int]]) -> List[int]:
    """Flatten per-dimension ``[before, after]`` pairs into ``F.pad`` order.

    ``F.pad`` expects pads for the last dimension first, so the outer
    list is reversed before flattening.
    """
    pairs = torch.tensor(pad_shape)
    last_dim_first = torch.flip(pairs, [0])
    return last_dim_first.reshape(-1).int().tolist()
125
+
126
+
127
def shift_1d(x):
    """Shift ``x`` one step right along the last axis, zero-filling the front.

    ``x`` is expected to be 3-D; the final step is dropped.
    """
    # Pad list equals convert_pad_shape([[0, 0], [0, 0], [1, 0]]).
    padded = F.pad(x, [1, 0, 0, 0, 0, 0])
    return padded[:, :, :-1]
130
+
131
+
132
def sequence_mask(length: torch.Tensor, max_length: Optional[int] = None):
    """Build a boolean mask marking the valid steps of each sequence.

    Args:
        length: 1-D tensor of sequence lengths.
        max_length: mask width; defaults to ``length.max()``.

    Returns:
        Boolean tensor of shape ``[len(length), max_length]`` that is true
        where the step index is smaller than the sequence length.
    """
    if max_length is None:
        max_length = length.max()
    steps = torch.arange(max_length, dtype=length.dtype, device=length.device)
    return steps.unsqueeze(0) < length.unsqueeze(1)
137
+
138
+
139
def generate_path(duration, mask):
    """Expand per-token durations into a monotonic alignment path.

    duration: [b, 1, t_x]
    mask: [b, 1, t_y, t_x]

    Returns a tensor shaped like ``mask``.
    """
    b, _, t_y, t_x = mask.shape
    cum_duration = torch.cumsum(duration, -1)

    # One row per (batch, token): true up to the cumulative end frame.
    ends = sequence_mask(cum_duration.view(b * t_x), t_y).to(mask.dtype)
    ends = ends.view(b, t_x, t_y)

    # Subtract the previous token's row to keep only this token's frames.
    # Pad list equals convert_pad_shape([[0, 0], [1, 0], [0, 0]]).
    previous = F.pad(ends, [0, 0, 1, 0, 0, 0])[:, :-1]
    path = ends - previous
    return path.unsqueeze(1).transpose(2, 3) * mask
155
+
156
+
157
def clip_grad_value_(parameters, clip_value, norm_type=2):
    """Clamp gradients in place and return their total norm before clamping.

    Args:
        parameters: a tensor or an iterable of tensors; entries without a
            gradient are ignored.
        clip_value: gradients are clamped to ``[-clip_value, clip_value]``;
            ``None`` disables clamping.
        norm_type: order of the norm that is reported.

    Returns:
        The ``norm_type``-norm over all gradients, as a Python number.
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    with_grad = [p for p in parameters if p.grad is not None]
    norm_type = float(norm_type)
    if clip_value is not None:
        clip_value = float(clip_value)

    total_norm = 0
    for p in with_grad:
        grad = p.grad.data
        # Accumulate the norm before this gradient is clamped.
        total_norm += grad.norm(norm_type).item() ** norm_type
        if clip_value is not None:
            grad.clamp_(min=-clip_value, max=clip_value)
    return total_norm ** (1.0 / norm_type)
zerorvc/synthesizer/models.py ADDED
@@ -0,0 +1,871 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import logging
3
+ from typing import List, Literal, Optional
4
+
5
+ import torch
6
+ from torch import nn
7
+ from torch.nn import functional as F
8
+ from torch.nn.utils import remove_weight_norm, spectral_norm
9
+ from torch.nn.utils.parametrizations import weight_norm
10
+ from huggingface_hub import PyTorchModelHubMixin
11
+
12
+ from . import attentions, commons, modules
13
+ from .commons import get_padding, init_weights
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class TextEncoder(nn.Module):
    """Encode phone features (and optional pitch ids) into prior statistics.

    Phone features go through a linear layer, are optionally summed with
    a pitch embedding, pass through ``attentions.Encoder`` and are
    projected to a mean and a log-scale of ``out_channels`` each.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        hidden_channels: int,
        filter_channels: int,
        n_heads: int,
        n_layers: int,
        kernel_size: int,
        p_dropout: float,
        f0=True,
    ):
        """Create the encoder.

        Args:
            in_channels: feature size of the phone input.
            out_channels: channels of each returned statistic.
            hidden_channels: width of the embeddings and the encoder.
            filter_channels, n_heads, n_layers, kernel_size, p_dropout:
                forwarded to ``attentions.Encoder``.
            f0: when truthy, a 256-entry pitch embedding is created.
        """
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = float(p_dropout)
        self.emb_phone = nn.Linear(in_channels, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
        self.encoder = attentions.Encoder(
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            float(p_dropout),
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(
        self,
        phone: torch.Tensor,
        pitch: Optional[torch.Tensor],
        lengths: torch.Tensor,
        skip_head: Optional[torch.Tensor] = None,
    ):
        """Return ``(m, logs, x_mask)`` for the given inputs.

        Args:
            phone: phone features, ``[b, t, in_channels]``.
            pitch: pitch ids for ``emb_pitch``, or ``None`` to use the
                phone features alone.
            lengths: valid length of each batch item.
            skip_head: optional scalar tensor; that many leading frames
                are dropped from the encoder output and the mask.

        Returns:
            ``m`` and ``logs`` with ``out_channels`` channels each, and
            the mask used.
        """
        if pitch is None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        if skip_head is not None:
            assert isinstance(skip_head, torch.Tensor)
            head = int(skip_head.item())
            x = x[:, :, head:]
            x_mask = x_mask[:, :, head:]
        stats = self.proj(x) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask
79
+
80
+
81
class ResidualCouplingBlock(nn.Module):
    """Stack of ``n_flows`` residual coupling layers, each followed by a flip."""

    def __init__(
        self,
        channels: int,
        hidden_channels: int,
        kernel_size: int,
        dilation_rate: float,
        n_layers: int,
        n_flows=4,
        gin_channels=0,
    ):
        """Build ``n_flows`` pairs of (``ResidualCouplingLayer``, ``Flip``)."""
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            coupling = modules.ResidualCouplingLayer(
                channels,
                hidden_channels,
                kernel_size,
                dilation_rate,
                n_layers,
                gin_channels=gin_channels,
                mean_only=True,
            )
            self.flows.append(coupling)
            self.flows.append(modules.Flip())

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        g: Optional[torch.Tensor] = None,
        reverse: bool = False,
    ):
        """Apply the flows in order, or in reversed order when ``reverse``."""
        if reverse:
            for flow in self.flows[::-1]:
                x, _ = flow.forward(x, x_mask, g=g, reverse=reverse)
        else:
            for flow in self.flows:
                x, _ = flow(x, x_mask, g=g, reverse=reverse)
        return x

    def remove_weight_norm(self):
        """Call ``remove_weight_norm`` on every coupling layer."""
        # Coupling layers sit at the even indices; Flip layers at the odd ones.
        for idx in range(0, 2 * self.n_flows, 2):
            self.flows[idx].remove_weight_norm()
134
+
135
+
136
class PosteriorEncoder(nn.Module):
    """Encode an input into a sampled latent and its Gaussian statistics."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        hidden_channels: int,
        kernel_size: int,
        dilation_rate: float,
        n_layers: int,
        gin_channels=0,
    ):
        """Build the 1x1 input conv, the ``modules.WN`` stack and the projection."""
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            gin_channels=gin_channels,
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(
        self, x: torch.Tensor, x_lengths: torch.Tensor, g: Optional[torch.Tensor] = None
    ):
        """Return ``(z, m, logs, x_mask)``.

        ``z`` is sampled as ``m + noise * exp(logs)`` and masked;
        ``x_mask`` is built from ``x_lengths`` over the last axis of ``x``.
        """
        valid = commons.sequence_mask(x_lengths, x.size(2))
        x_mask = torch.unsqueeze(valid, 1).to(x.dtype)

        hidden = self.pre(x) * x_mask
        hidden = self.enc(hidden, x_mask, g=g)
        stats = self.proj(hidden) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        noise = torch.randn_like(m)
        z = (m + noise * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask

    def remove_weight_norm(self):
        """Remove weight normalisation from the ``WN`` stack."""
        self.enc.remove_weight_norm()
181
+
182
+
183
+ class Generator(torch.nn.Module):
184
+ def __init__(
185
+ self,
186
+ initial_channel: int,
187
+ resblock: Literal["1", "2"],
188
+ resblock_kernel_sizes,
189
+ resblock_dilation_sizes,
190
+ upsample_rates,
191
+ upsample_initial_channel: int,
192
+ upsample_kernel_sizes,
193
+ gin_channels=0,
194
+ ):
195
+ super().__init__()
196
+ self.num_kernels = len(resblock_kernel_sizes)
197
+ self.num_upsamples = len(upsample_rates)
198
+ self.conv_pre = nn.Conv1d(
199
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
200
+ )
201
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
202
+
203
+ self.ups = nn.ModuleList()
204
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
205
+ self.ups.append(
206
+ weight_norm(
207
+ nn.ConvTranspose1d(
208
+ upsample_initial_channel // (2**i),
209
+ upsample_initial_channel // (2 ** (i + 1)),
210
+ k,
211
+ u,
212
+ padding=(k - u) // 2,
213
+ )
214
+ )
215
+ )
216
+
217
+ self.resblocks = nn.ModuleList()
218
+ for i in range(len(self.ups)):
219
+ ch = upsample_initial_channel // (2 ** (i + 1))
220
+ for j, (k, d) in enumerate(
221
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
222
+ ):
223
+ self.resblocks.append(resblock(ch, k, d))
224
+
225
+ self.conv_post = nn.Conv1d(ch, 1, 7, 1, padding=3, bias=False)
226
+ self.ups.apply(init_weights)
227
+
228
+ if gin_channels != 0:
229
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
230
+
231
+ def forward(
232
+ self,
233
+ x: torch.Tensor,
234
+ g: Optional[torch.Tensor] = None,
235
+ n_res: Optional[torch.Tensor] = None,
236
+ ):
237
+ if n_res is not None:
238
+ assert isinstance(n_res, torch.Tensor)
239
+ n = int(n_res.item())
240
+ if n != x.shape[-1]:
241
+ x = F.interpolate(x, size=n, mode="linear")
242
+ x = self.conv_pre(x)
243
+ if g is not None:
244
+ x = x + self.cond(g)
245
+
246
+ for i in range(self.num_upsamples):
247
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
248
+ x = self.ups[i](x)
249
+ xs = None
250
+ for j in range(self.num_kernels):
251
+ if xs is None:
252
+ xs = self.resblocks[i * self.num_kernels + j](x)
253
+ else:
254
+ xs += self.resblocks[i * self.num_kernels + j](x)
255
+ x = xs / self.num_kernels
256
+ x = F.leaky_relu(x)
257
+ x = self.conv_post(x)
258
+ x = torch.tanh(x)
259
+
260
+ return x
261
+
262
+ def remove_weight_norm(self):
263
+ for l in self.ups:
264
+ remove_weight_norm(l)
265
+ for l in self.resblocks:
266
+ l.remove_weight_norm()
267
+
268
+
269
class SineGen(torch.nn.Module):
    """Definition of sine generator
    SineGen(samp_rate, harmonic_num = 0,
            sine_amp = 0.1, noise_std = 0.003,
            voiced_threshold = 0,
            flag_for_pulse=False)
    samp_rate: sampling rate in Hz
    harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of sine-waveform (default 0.1)
    noise_std: std of Gaussian noise (default 0.003)
    voiced_threshold: F0 threshold for U/V classification (default 0)
    flag_for_pulse: accepted for signature compatibility; it is neither
        stored nor read by this class (default False)
    """

    def __init__(
        self,
        samp_rate,
        harmonic_num=0,
        sine_amp=0.1,
        noise_std=0.003,
        voiced_threshold=0,
        flag_for_pulse=False,
    ):
        super().__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        # One channel for the fundamental plus one per overtone.
        self.dim = self.harmonic_num + 1
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold

    def _f02uv(self, f0):
        """Return a mask that is 1 where ``f0 > voiced_threshold`` and 0 elsewhere."""
        # generate uv signal
        uv = torch.ones_like(f0)
        uv = uv * (f0 > self.voiced_threshold)
        return uv

    def forward(self, f0: torch.Tensor, upp: int):
        """sine_waves, uv, noise = forward(f0, upp)

        f0: F0 track; f0 for unvoiced steps should be 0.
            NOTE(review): the indexing below presumably expects
            [batch, length] -- confirm against callers.
        upp: integer factor by which the time axis is upsampled.

        Returns:
            sine_waves: sines (voiced steps) plus noise, ``dim`` channels
                on the last axis, time axis upsampled by ``upp``.
            uv: voiced/unvoiced mask upsampled by ``upp``.
            noise: the Gaussian noise that was added to ``sine_waves``.
        All computation runs under ``torch.no_grad()``.
        """
        with torch.no_grad():
            # Insert an axis at dim 1, then swap dims 1 and 2.
            f0 = f0[:, None].transpose(1, 2)
            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
            # fundamental component
            f0_buf[:, :, 0] = f0[:, :, 0]
            for idx in range(self.harmonic_num):
                f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
                    idx + 2
                )  # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
            # Per-step phase increment in cycles. The "% 1" here means the
            # n_har products cannot be optimized in post-processing.
            rad_values = (f0_buf / self.sampling_rate) % 1
            # Random initial phase per channel; channel 0 (fundamental) gets 0.
            rand_ini = torch.rand(
                f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
            )
            rand_ini[:, 0] = 0
            rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
            # A "% 1" here would mean the later cumsum could not be
            # optimized any further, so it is applied after interpolation.
            tmp_over_one = torch.cumsum(rad_values, 1)
            tmp_over_one *= upp
            # Upsample the accumulated phase linearly along time.
            tmp_over_one = F.interpolate(
                tmp_over_one.transpose(2, 1),
                scale_factor=float(upp),
                mode="linear",
                align_corners=True,
            ).transpose(2, 1)
            # Upsample the per-step increments by repetition (nearest).
            rad_values = F.interpolate(
                rad_values.transpose(2, 1), scale_factor=float(upp), mode="nearest"
            ).transpose(2, 1)
            tmp_over_one %= 1
            # True where the wrapped phase decreased between adjacent steps.
            tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
            # Subtract one full cycle at those steps before the final cumsum.
            cumsum_shift = torch.zeros_like(rad_values)
            cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
            sine_waves = torch.sin(
                torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * torch.pi
            )
            sine_waves = sine_waves * self.sine_amp
            uv = self._f02uv(f0)
            uv = F.interpolate(
                uv.transpose(2, 1), scale_factor=float(upp), mode="nearest"
            ).transpose(2, 1)
            # Noise std is noise_std where voiced and sine_amp / 3 where unvoiced.
            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
            noise = noise_amp * torch.randn_like(sine_waves)
            # Unvoiced steps keep only the noise.
            sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise
359
+
360
+
361
class SourceModuleHnNSF(torch.nn.Module):
    """SourceModule for hn-nsf
    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0)
    sampling_rate: sampling_rate in Hz
    harmonic_num: number of harmonics above F0 (default: 0)
    sine_amp: amplitude of sine source signal (default: 0.1)
    add_noise_std: std of additive Gaussian noise (default: 0.003)
    voiced_threshod: threshold to set U/V given F0 (default: 0)

    sine_merge, None, None = SourceModuleHnNSF(F0_sampled, upp)
    Only the merged sine source is produced; the noise and uv slots of
    the returned tuple are always ``None``.
    """

    def __init__(
        self,
        sampling_rate: int,
        harmonic_num=0,
        sine_amp=0.1,
        add_noise_std=0.003,
        voiced_threshod=0,
    ):
        super().__init__()
        self.sine_amp = sine_amp
        self.noise_std = add_noise_std

        # Produces the sine waveforms, one channel per harmonic.
        self.l_sin_gen = SineGen(
            sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
        )

        # Merges the harmonic channels into a single excitation.
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x: torch.Tensor, upp: int = 1):
        """Return ``(sine_merge, None, None)`` for the F0 input ``x``."""
        sine_wavs, uv, _ = self.l_sin_gen(x, upp)
        # Match the linear layer's dtype before merging the harmonics.
        merged_input = sine_wavs.to(dtype=self.l_linear.weight.dtype)
        sine_merge = self.l_tanh(self.l_linear(merged_input))
        return sine_merge, None, None  # noise, uv
412
+
413
+
414
+ class GeneratorNSF(torch.nn.Module):
415
+ def __init__(
416
+ self,
417
+ initial_channel,
418
+ resblock,
419
+ resblock_kernel_sizes,
420
+ resblock_dilation_sizes,
421
+ upsample_rates,
422
+ upsample_initial_channel,
423
+ upsample_kernel_sizes,
424
+ gin_channels,
425
+ sr,
426
+ ):
427
+ super().__init__()
428
+ self.num_kernels = len(resblock_kernel_sizes)
429
+ self.num_upsamples = len(upsample_rates)
430
+
431
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=math.prod(upsample_rates))
432
+ self.m_source = SourceModuleHnNSF(
433
+ sampling_rate=sr,
434
+ harmonic_num=0,
435
+ )
436
+ self.noise_convs = nn.ModuleList()
437
+ self.conv_pre = nn.Conv1d(
438
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
439
+ )
440
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
441
+
442
+ self.ups = nn.ModuleList()
443
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
444
+ c_cur = upsample_initial_channel // (2 ** (i + 1))
445
+ self.ups.append(
446
+ weight_norm(
447
+ nn.ConvTranspose1d(
448
+ upsample_initial_channel // (2**i),
449
+ upsample_initial_channel // (2 ** (i + 1)),
450
+ k,
451
+ u,
452
+ padding=(k - u) // 2,
453
+ )
454
+ )
455
+ )
456
+ if i + 1 < len(upsample_rates):
457
+ stride_f0 = math.prod(upsample_rates[i + 1 :])
458
+ self.noise_convs.append(
459
+ nn.Conv1d(
460
+ 1,
461
+ c_cur,
462
+ kernel_size=stride_f0 * 2,
463
+ stride=stride_f0,
464
+ padding=stride_f0 // 2,
465
+ )
466
+ )
467
+ else:
468
+ self.noise_convs.append(nn.Conv1d(1, c_cur, kernel_size=1))
469
+
470
+ self.resblocks = nn.ModuleList()
471
+ for i in range(len(self.ups)):
472
+ ch = upsample_initial_channel // (2 ** (i + 1))
473
+ for j, (k, d) in enumerate(
474
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
475
+ ):
476
+ self.resblocks.append(resblock(ch, k, d))
477
+
478
+ self.conv_post = nn.Conv1d(ch, 1, 7, 1, padding=3, bias=False)
479
+ self.ups.apply(init_weights)
480
+
481
+ if gin_channels != 0:
482
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
483
+
484
+ self.upp = math.prod(upsample_rates)
485
+
486
+ self.lrelu_slope = modules.LRELU_SLOPE
487
+
488
+ def forward(
489
+ self,
490
+ x,
491
+ f0,
492
+ g: Optional[torch.Tensor] = None,
493
+ n_res: Optional[torch.Tensor] = None,
494
+ ):
495
+ har_source, noi_source, uv = self.m_source(f0, self.upp)
496
+ har_source = har_source.transpose(1, 2)
497
+ if n_res is not None:
498
+ assert isinstance(n_res, torch.Tensor)
499
+ n = int(n_res.item())
500
+ if n * self.upp != har_source.shape[-1]:
501
+ har_source = F.interpolate(har_source, size=n * self.upp, mode="linear")
502
+ if n != x.shape[-1]:
503
+ x = F.interpolate(x, size=n, mode="linear")
504
+ x = self.conv_pre(x)
505
+ if g is not None:
506
+ x = x + self.cond(g)
507
+ for i, (ups, noise_convs) in enumerate(zip(self.ups, self.noise_convs)):
508
+ if i < self.num_upsamples:
509
+ x = F.leaky_relu(x, self.lrelu_slope)
510
+ x = ups(x)
511
+ x_source = noise_convs(har_source)
512
+ x = x + x_source
513
+ xs: torch.Tensor = None
514
+ l = [i * self.num_kernels + j for j in range(self.num_kernels)]
515
+ for j, resblock in enumerate(self.resblocks):
516
+ if j in l:
517
+ if xs is None:
518
+ xs = resblock(x)
519
+ else:
520
+ xs += resblock(x)
521
+ x = xs / self.num_kernels
522
+ x = F.leaky_relu(x)
523
+ x = self.conv_post(x)
524
+ x = torch.tanh(x)
525
+
526
+ return x
527
+
528
+ def remove_weight_norm(self):
529
+ for l in self.ups:
530
+ remove_weight_norm(l)
531
+ for l in self.resblocks:
532
+ l.remove_weight_norm()
533
+
534
+
535
class SynthesizerTrnMs256NSFsid(nn.Module):
    """Synthesizer combining a prior encoder, posterior encoder, flow and NSF decoder.

    Sub-modules created by ``__init__`` (in this order):

    * ``enc_p`` -- ``TextEncoder`` built with ``256`` as its first argument
      (presumably the width of the incoming phone features -- confirm
      against ``TextEncoder``).
    * ``dec``   -- ``GeneratorNSF`` conditioned through ``gin_channels``.
    * ``enc_q`` -- ``PosteriorEncoder``; only ``forward`` uses it.
    * ``flow``  -- ``ResidualCouplingBlock``.
    * ``emb_g`` -- ``nn.Embedding(spk_embed_dim, gin_channels)`` used to look
      up the conditioning vector from an id tensor.

    All constructor arguments are also stored as plain attributes of the
    same name (``p_dropout`` is converted with ``float``).
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr,
    ):
        super().__init__()
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = float(p_dropout)
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim

        # NOTE: the creation order below is kept as-is; it determines the
        # order in which sub-modules are registered on this module.
        self.enc_p = TextEncoder(
            256,
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            float(p_dropout),
        )
        self.dec = GeneratorNSF(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
            sr=sr,
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        # Lazy %-formatting: the message is only rendered when debug
        # logging is actually enabled.
        logger.debug(
            "gin_channels: %s, self.spk_embed_dim: %s",
            gin_channels,
            self.spk_embed_dim,
        )

    def remove_weight_norm(self):
        """Remove weight normalization from the decoder, the flow and ``enc_q``.

        ``enc_q`` is only touched when the attribute is still present, so
        the call also works after ``enc_q`` has been deleted from the
        instance.
        """
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        if hasattr(self, "enc_q"):
            self.enc_q.remove_weight_norm()

    def forward(
        self,
        phone: torch.Tensor,
        phone_lengths: torch.Tensor,
        pitch: torch.Tensor,
        pitchf: torch.Tensor,
        y: torch.Tensor,
        y_lengths: torch.Tensor,
        ds: Optional[torch.Tensor] = None,
    ):
        """Run the posterior path and decode a random slice of the latent.

        Args:
            phone, pitch, phone_lengths: forwarded to ``enc_p``.
            pitchf: sliced with the same indices as the latent and passed
                to the decoder.
            y, y_lengths: forwarded to ``enc_q``.
            ds: id tensor fed to the ``emb_g`` embedding (the original
                author's note describes it as an id of shape ``[bs, 1]``).
                Required despite the ``None`` default.

        Returns:
            ``(o, ids_slice, x_mask, y_mask,
            (z, z_p, m_p, logs_p, m_q, logs_q))`` where ``o`` is the decoder
            output for the random slice starting at ``ids_slice``.

        Raises:
            ValueError: if ``ds`` is ``None``.
        """
        if ds is None:
            # The signature allows None, but the embedding lookup below
            # cannot work without ids; fail with a clear message instead of
            # an unrelated error from inside the embedding call.
            raise ValueError("ds (speaker id tensor) must be provided to forward()")
        # Original note: g is [b, 256, 1]; the trailing 1 is the time axis
        # and is meant to broadcast.
        g = self.emb_g(ds).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        # Cut the same window out of pitchf so it lines up with z_slice.
        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
        o = self.dec(z_slice, pitchf, g=g)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(
        self,
        phone: torch.Tensor,
        phone_lengths: torch.Tensor,
        pitch: torch.Tensor,
        nsff0: torch.Tensor,
        sid: torch.Tensor,
        skip_head: Optional[torch.Tensor] = None,
        return_length: Optional[torch.Tensor] = None,
        return_length2: Optional[torch.Tensor] = None,
    ):
        """Sample from the prior, invert the flow and decode.

        When both ``skip_head`` and ``return_length`` are given, only a
        window is decoded: ``enc_p`` receives ``max(skip_head - 24, 0)`` as
        an extra argument, the flow output and mask are cut to
        ``return_length`` frames starting at ``skip_head`` minus that value,
        and ``nsff0`` is cut to ``[skip_head, skip_head + return_length)``.
        Otherwise the full sequence is decoded.

        Args:
            phone, pitch, phone_lengths: forwarded to ``enc_p``.
            nsff0: passed to the decoder (after optional windowing).
            sid: id tensor fed to the ``emb_g`` embedding.
            skip_head: optional scalar tensor, first frame to return.
            return_length: optional scalar tensor, number of frames to return.
            return_length2: forwarded to the decoder as ``n_res``.

        Returns:
            ``(o, x_mask, (z, z_p, m_p, logs_p))``.
        """
        g = self.emb_g(sid).unsqueeze(-1)
        if skip_head is not None and return_length is not None:
            # These asserts narrow the Optional types for the code below.
            assert isinstance(skip_head, torch.Tensor)
            assert isinstance(return_length, torch.Tensor)
            head = int(skip_head.item())
            length = int(return_length.item())
            # Start the encoder/flow up to 24 frames before the requested
            # head; dec_head is where the requested window begins inside
            # that shorter sequence.
            flow_head = torch.clamp(skip_head - 24, min=0)
            dec_head = head - int(flow_head.item())
            m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths, flow_head)
            # Sample around the prior mean; the noise is scaled by 0.66666.
            z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
            z = self.flow(z_p, x_mask, g=g, reverse=True)
            z = z[:, :, dec_head : dec_head + length]
            x_mask = x_mask[:, :, dec_head : dec_head + length]
            nsff0 = nsff0[:, head : head + length]
        else:
            m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
            # Sample around the prior mean; the noise is scaled by 0.66666.
            z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
            z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec(z * x_mask, nsff0, g=g, n_res=return_length2)
        return o, x_mask, (z, z_p, m_p, logs_p)
678
+
679
+
680
class SynthesizerTrnMs768NSFsid(SynthesizerTrnMs256NSFsid, PyTorchModelHubMixin):
    """Variant of ``SynthesizerTrnMs256NSFsid`` whose ``enc_p`` is built with 768.

    Construction is delegated entirely to the parent class; afterwards the
    parent's ``enc_p`` is discarded and replaced by a ``TextEncoder`` that
    receives ``768`` instead of ``256`` as its first argument.  Everything
    else (decoder, posterior encoder, flow, embedding, ``forward`` and
    ``infer``) is inherited unchanged.
    """

    def __init__(
        self,
        spec_channels: int,
        segment_size: int,
        inter_channels: int,
        hidden_channels: int,
        filter_channels: int,
        n_heads: int,
        n_layers: int,
        kernel_size: int,
        p_dropout: float,
        resblock: Literal["1", "2"],
        resblock_kernel_sizes: List[int],
        resblock_dilation_sizes: list[list[int]],
        upsample_rates: list[int],
        upsample_initial_channel: int,
        upsample_kernel_sizes: list[int],
        spk_embed_dim: int,
        gin_channels: int,
        sr: int,
    ):
        super().__init__(
            spec_channels=spec_channels,
            segment_size=segment_size,
            inter_channels=inter_channels,
            hidden_channels=hidden_channels,
            filter_channels=filter_channels,
            n_heads=n_heads,
            n_layers=n_layers,
            kernel_size=kernel_size,
            p_dropout=p_dropout,
            resblock=resblock,
            resblock_kernel_sizes=resblock_kernel_sizes,
            resblock_dilation_sizes=resblock_dilation_sizes,
            upsample_rates=upsample_rates,
            upsample_initial_channel=upsample_initial_channel,
            upsample_kernel_sizes=upsample_kernel_sizes,
            spk_embed_dim=spk_embed_dim,
            gin_channels=gin_channels,
            sr=sr,
        )
        # Drop the encoder the parent just registered before assigning the
        # replacement; the explicit delete is kept so the new encoder is
        # registered afresh rather than overwriting the old entry in place.
        del self.enc_p
        encoder_args = (
            768,
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            float(p_dropout),
        )
        self.enc_p = TextEncoder(*encoder_args)
733
+
734
+
735
class MultiPeriodDiscriminator(torch.nn.Module):
    """Bundle of one ``DiscriminatorS`` followed by several ``DiscriminatorP``.

    One ``DiscriminatorP`` is created per entry of the hard-coded period
    list; all members share the same ``use_spectral_norm`` setting and are
    stored in ``self.discriminators``.
    """

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        # A shorter alternative, [2, 3, 5, 7, 11, 17], was left disabled in
        # the original code.
        periods = (2, 3, 5, 7, 11, 17, 23, 37)

        members = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        members.extend(
            DiscriminatorP(period, use_spectral_norm=use_spectral_norm)
            for period in periods
        )
        self.discriminators = nn.ModuleList(members)

    def forward(self, y, y_hat):
        """Apply every member discriminator to ``y`` and to ``y_hat``.

        Returns:
            Four lists with one entry per discriminator, in order: outputs
            for ``y``, outputs for ``y_hat``, feature maps for ``y`` and
            feature maps for ``y_hat``.
        """
        y_outputs, y_hat_outputs = [], []
        y_fmaps, y_hat_fmaps = [], []
        for disc in self.discriminators:
            out_y, fmap_y = disc(y)
            out_y_hat, fmap_y_hat = disc(y_hat)
            y_outputs.append(out_y)
            y_hat_outputs.append(out_y_hat)
            y_fmaps.append(fmap_y)
            y_hat_fmaps.append(fmap_y_hat)

        return y_outputs, y_hat_outputs, y_fmaps, y_hat_fmaps
763
+
764
+
765
class DiscriminatorS(torch.nn.Module):
    """Discriminator made of six normalized 1-D convolutions plus an output conv.

    Args:
        use_spectral_norm: when truthy every convolution is wrapped with
            ``spectral_norm``; otherwise with ``weight_norm``.
    """

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        # Truthiness test instead of ``== False``: any falsy flag (False, 0,
        # None) selects weight normalization.
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        # (in_channels, out_channels, kernel_size, stride, groups, padding)
        layer_specs = (
            (1, 16, 15, 1, 1, 7),
            (16, 64, 41, 4, 4, 20),
            (64, 256, 41, 4, 16, 20),
            (256, 1024, 41, 4, 64, 20),
            (1024, 1024, 41, 4, 256, 20),
            (1024, 1024, 5, 1, 1, 2),
        )
        self.convs = nn.ModuleList(
            [
                norm_f(
                    nn.Conv1d(
                        in_ch, out_ch, kernel, stride, groups=groups, padding=pad
                    )
                )
                for in_ch, out_ch, kernel, stride, groups, pad in layer_specs
            ]
        )
        self.conv_post = norm_f(nn.Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return the flattened final output and all intermediate feature maps.

        Each convolution in ``self.convs`` is followed by a leaky ReLU with
        slope ``modules.LRELU_SLOPE`` and its activation is recorded; the
        output of ``conv_post`` is recorded as well (without activation)
        and then flattened from dimension 1 onwards.

        Returns:
            ``(x, fmap)`` where ``fmap`` has ``len(self.convs) + 1`` entries.
        """
        fmap = []

        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
793
+
794
+
795
class DiscriminatorP(torch.nn.Module):
    """Discriminator that folds the signal by ``period`` and applies 2-D convs.

    Args:
        period: width of the folded representation; the time axis is
            reshaped to ``(t // period, period)`` in ``forward``.
        kernel_size: kernel height used by every layer in ``self.convs``
            (the kernel width is always 1).
        stride: vertical stride of the first four layers; the fifth layer
            uses stride 1.
        use_spectral_norm: when truthy every convolution is wrapped with
            ``spectral_norm``; otherwise with ``weight_norm``.
    """

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super().__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        # Truthiness test instead of ``== False``: any falsy flag (False, 0,
        # None) selects weight normalization.
        norm_f = spectral_norm if use_spectral_norm else weight_norm

        def make_conv(in_ch, out_ch, conv_stride):
            # One normalized (kernel_size x 1) convolution, padded along the
            # first spatial axis only.
            return norm_f(
                nn.Conv2d(
                    in_ch,
                    out_ch,
                    (kernel_size, 1),
                    conv_stride,
                    padding=(get_padding(kernel_size, 1), 0),
                )
            )

        self.convs = nn.ModuleList(
            [
                make_conv(1, 32, (stride, 1)),
                make_conv(32, 128, (stride, 1)),
                make_conv(128, 512, (stride, 1)),
                make_conv(512, 1024, (stride, 1)),
                make_conv(1024, 1024, 1),
            ]
        )
        self.conv_post = norm_f(nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return the flattened final output and all intermediate feature maps.

        ``x`` must be 3-D ``(b, c, t)``.  If ``t`` is not a multiple of
        ``self.period`` it is first reflect-padded on the right up to the
        next multiple, then viewed as ``(b, c, t // period, period)``.
        Each convolution in ``self.convs`` is followed by a leaky ReLU with
        slope ``modules.LRELU_SLOPE``; the output of ``conv_post`` is
        recorded without activation and flattened from dimension 1 onwards.

        Returns:
            ``(x, fmap)`` where ``fmap`` has ``len(self.convs) + 1`` entries.
        """
        fmap = []

        # 1d to 2d
        b, c, t = x.shape
        remainder = t % self.period
        if remainder != 0:  # pad first
            n_pad = self.period - remainder
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap