OlaWod committed on
Commit
1ae0fad
1 Parent(s): 4bf414e

first commit

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -1
  2. .gitignore +3 -0
  3. LICENSE +21 -0
  4. app.py +149 -0
  5. config_v1_16k.json +42 -0
  6. dataset/audio/p225/p225_220.wav +0 -0
  7. dataset/audio/p226/p226_341.wav +0 -0
  8. dataset/audio/p227/p227_021.wav +0 -0
  9. dataset/audio/p228/p228_242.wav +0 -0
  10. dataset/audio/p229/p229_021.wav +0 -0
  11. dataset/audio/p230/p230_361.wav +0 -0
  12. dataset/audio/p231/p231_197.wav +0 -0
  13. dataset/audio/p232/p232_023.wav +0 -0
  14. dataset/audio/p233/p233_323.wav +0 -0
  15. dataset/audio/p234/p234_229.wav +0 -0
  16. dataset/audio/p236/p236_068.wav +0 -0
  17. dataset/audio/p237/p237_023.wav +0 -0
  18. dataset/audio/p238/p238_023.wav +0 -0
  19. dataset/audio/p239/p239_023.wav +0 -0
  20. dataset/audio/p240/p240_004.wav +0 -0
  21. dataset/audio/p241/p241_050.wav +0 -0
  22. dataset/audio/p243/p243_087.wav +0 -0
  23. dataset/audio/p244/p244_008.wav +0 -0
  24. dataset/audio/p245/p245_014.wav +0 -0
  25. dataset/audio/p246/p246_022.wav +0 -0
  26. dataset/audio/p247/p247_380.wav +0 -0
  27. dataset/audio/p248/p248_023.wav +0 -0
  28. dataset/audio/p249/p249_223.wav +0 -0
  29. dataset/audio/p250/p250_021.wav +0 -0
  30. dataset/audio/p251/p251_364.wav +0 -0
  31. dataset/audio/p252/p252_023.wav +0 -0
  32. dataset/audio/p253/p253_207.wav +0 -0
  33. dataset/audio/p254/p254_023.wav +0 -0
  34. dataset/audio/p255/p255_038.wav +0 -0
  35. dataset/audio/p256/p256_079.wav +0 -0
  36. dataset/audio/p257/p257_023.wav +0 -0
  37. dataset/audio/p258/p258_228.wav +0 -0
  38. dataset/audio/p259/p259_011.wav +0 -0
  39. dataset/audio/p260/p260_103.wav +0 -0
  40. dataset/audio/p261/p261_023.wav +0 -0
  41. dataset/audio/p262/p262_210.wav +0 -0
  42. dataset/audio/p263/p263_218.wav +0 -0
  43. dataset/audio/p264/p264_438.wav +0 -0
  44. dataset/audio/p265/p265_273.wav +0 -0
  45. dataset/audio/p266/p266_417.wav +0 -0
  46. dataset/audio/p267/p267_022.wav +0 -0
  47. dataset/audio/p268/p268_021.wav +0 -0
  48. dataset/audio/p269/p269_332.wav +0 -0
  49. dataset/audio/p270/p270_297.wav +0 -0
  50. dataset/audio/p271/p271_170.wav +0 -0
.gitattributes CHANGED
@@ -11,7 +11,7 @@
11
  *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
  *.model filter=lfs diff=lfs merge=lfs -text
13
  *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
  *.npz filter=lfs diff=lfs merge=lfs -text
16
  *.onnx filter=lfs diff=lfs merge=lfs -text
17
  *.ot filter=lfs diff=lfs merge=lfs -text
 
11
  *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
  *.model filter=lfs diff=lfs merge=lfs -text
13
  *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ # *.npy filter=lfs diff=lfs merge=lfs -text
15
  *.npz filter=lfs diff=lfs merge=lfs -text
16
  *.onnx filter=lfs diff=lfs merge=lfs -text
17
  *.ot filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ __pycache__
2
+ flagged
3
+ out.wav
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Jingyi Li
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
app.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import math
4
+
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import librosa
8
+ import numpy as np
9
+ import soundfile as sf
10
+ import gradio as gr
11
+ import openvino as ov
12
+
13
+ from env import AttrDict
14
+ from meldataset import mel_spectrogram, MAX_WAV_VALUE
15
+ from stft import TorchSTFT
16
+
17
+
18
# files
hpfile = "config_v1_16k.json"
g1path = "exp/g1.xml"
g2path = "exp/g2.xml"
spk2id_path = "filelists/spk2id.json"
f0_stats_path = "filelists/f0_stats.json"
spk_stats_path = "filelists/spk_stats.json"
spk_emb_dir = "dataset/spk"
spk_wav_dir = "dataset/audio"


def _load_json(path):
    """Parse the JSON file at *path* and return the decoded object."""
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, encoding="utf-8") as f:
        return json.load(f)


# load config
json_config = _load_json(hpfile)
h = AttrDict(json_config)

# load models
# Each generator is read from its OpenVINO IR file and compiled for CPU.
core = ov.Core()
g1 = core.compile_model(model=core.read_model(model=g1path), device_name="CPU")
g2 = core.compile_model(model=core.read_model(model=g2path), device_name="CPU")

# STFT helper used between the two generators; its window length is set to
# the FFT size from the config.
stft = TorchSTFT(
    filter_length=h.gen_istft_n_fft,
    hop_length=h.gen_istft_hop_size,
    win_length=h.gen_istft_n_fft,
)

# load stats
spk2id = _load_json(spk2id_path)
f0_stats = _load_json(f0_stats_path)
spk_stats = _load_json(spk_stats_path)
50
+
51
# tune f0
# Values at or below this threshold are treated as unvoiced and never shifted.
threshold = 10
# One shift unit: 1/256 of the natural-log distance between 50 and 1100.
step = (math.log(1100) - math.log(50)) / 256


def tune_f0(initial_f0, i):
    """Shift the voiced entries of an F0 tensor by ``i`` log-frequency steps.

    Args:
        initial_f0: torch tensor of F0 values. Entries not greater than
            ``threshold`` are treated as unvoiced and returned unchanged.
        i: number of ``step`` units to shift by; may be negative.

    Returns:
        ``initial_f0`` itself when ``i == 0``; otherwise a new tensor in which
        every voiced entry is multiplied by ``exp(step * i)``.
    """
    if i == 0:
        return initial_f0

    # Adding step * i in the log domain equals multiplying by exp(step * i).
    # Multiplying directly avoids log(0) = -inf (and log of a negative = NaN)
    # on unvoiced entries, which would only be masked out again afterwards.
    factor = math.exp(step * i)
    voiced = initial_f0 > threshold
    return torch.where(voiced, initial_f0 * factor, initial_f0)
63
+
64
# infer
def infer(wav, mel, spk_emb, spk_id, f0_mean_tgt):
    """Run g1 -> STFT -> g2 -> inverse STFT and return the result.

    The five arguments are passed to ``g1`` unchanged and in this order.
    The return value is whatever ``stft.inverse`` produces from the two
    outputs of ``g2``.
    """
    # First generator: output 0 feeds g2 directly, output 1 is the harmonic
    # source that goes through the STFT first.
    g1_out = g1([wav, mel, spk_emb, spk_id, f0_mean_tgt])
    x, har_source = g1_out[g1.output(0)], g1_out[g1.output(1)]

    # The STFT works on torch tensors while the compiled models exchange
    # numpy arrays, so convert on the way in and on the way out.
    har_spec, har_phase = (
        t.numpy() for t in stft.transform(torch.from_numpy(har_source))
    )

    # Second generator: output 0 and output 1 are handed to the inverse STFT
    # as (spec, phase).
    g2_out = g2([x, har_spec, har_phase])
    spec = torch.from_numpy(g2_out[g2.output(0)])
    phase = torch.from_numpy(g2_out[g2.output(1)])

    return stft.inverse(spec, phase)
86
+
87
def _to_pcm16(audio):
    """Peak-normalize a waveform tensor to 0.95 and return it as an int16 array."""
    peak = torch.max(torch.abs(audio))
    # An all-zero clip has peak 0; dividing by it would produce NaN, which
    # turns into garbage when cast to int16. Leave silence as silence.
    if peak > 0:
        audio = audio / peak * 0.95
    audio = audio * MAX_WAV_VALUE
    return audio.cpu().numpy().astype('int16')


# convert function
def convert(tgt_spk, src_wav, f0_shift=0):
    """Convert the source recording to the target speaker and write ``out.wav``.

    Args:
        tgt_spk: speaker key; must be present in ``spk_stats``, ``spk2id``
            and ``f0_stats``.
        src_wav: path of the source audio file.
        f0_shift: value of the "F0 Shift" slider.
            NOTE(review): this argument is currently not used anywhere below,
            and ``tune_f0`` is never called, so the slider has no effect on
            the output. ``g1`` is only given ``f0_mean_tgt`` here; confirm
            how the shift is meant to be applied before wiring it in.

    Returns:
        The string ``"out.wav"``, the file the converted audio was written to.

    Raises:
        gr.Error: if no known target speaker or no source audio was given.
    """
    # Fail with a message the UI can display instead of an opaque
    # KeyError/TypeError from deep inside the pipeline.
    if tgt_spk not in spk_stats:
        raise gr.Error("Please select a target speaker.")
    if not src_wav:
        raise gr.Error("Please provide a source audio.")

    tgt_ref = spk_stats[tgt_spk]["best_spk_emb"]
    tgt_emb = f"{spk_emb_dir}/{tgt_spk}/{tgt_ref}.npy"

    with torch.no_grad():
        # tgt: every model input gets a leading axis of size 1 via [None, :]
        spk_id = np.array([spk2id[tgt_spk]], dtype=np.int64)[None, :]
        spk_emb = np.load(tgt_emb)[None, :]
        f0_mean_tgt = np.array([f0_stats[tgt_spk]["mean"]], dtype=np.float32)[None, :]

        # src: load at the configured rate so it matches the rate used for
        # the mel spectrogram and for the written file.
        wav, _ = librosa.load(src_wav, sr=h.sampling_rate)
        wav = wav[None, :]
        mel = mel_spectrogram(
            torch.from_numpy(wav),
            h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size,
            h.fmin, h.fmax,
        ).numpy()

        # cvt
        y = infer(wav, mel, spk_emb, spk_id, f0_mean_tgt)

    audio = _to_pcm16(y.squeeze())

    # NOTE(review): the output path is fixed, so overlapping requests
    # overwrite each other's result.
    out_wav = "out.wav"
    sf.write(out_wav, audio, h.sampling_rate, "PCM_16")
    return out_wav
119
+
120
# change spk
def change_spk(tgt_spk):
    """Return the path of the reference recording for ``tgt_spk``.

    The file name is the ``"best_spk_emb"`` entry of ``spk_stats[tgt_spk]``
    with a ``.wav`` extension, located under ``spk_wav_dir/<tgt_spk>/``.
    """
    ref_name = spk_stats[tgt_spk]["best_spk_emb"]
    return f"{spk_wav_dir}/{tgt_spk}/{ref_name}.wav"
125
+
126
# interface
with gr.Blocks() as demo:
    gr.Markdown("# PitchVC-vino")
    gr.Markdown("Gradio Demo for PitchVC with OpenVINO on CPU. ([Github Repo](https://github.com/OlaWod/PitchVC))")

    with gr.Row():
        with gr.Column():
            # Dropdown documents `choices` as a list; pass the speaker keys
            # as a real list instead of a dict view.
            tgt_spk = gr.Dropdown(choices=list(spk2id), type="value", label="Target Speaker")
            ref_audio = gr.Audio(label="Reference Audio", type='filepath')
            src_audio = gr.Audio(label="Source Audio", type='filepath')
            f0_shift = gr.Slider(minimum=-30, maximum=30, value=0, step=1, label="F0 Shift")
        with gr.Column():
            out_audio = gr.Audio(label="Output Audio", type='filepath')
            submit = gr.Button(value="Submit")

    # Picking a speaker loads that speaker's reference recording into the
    # "Reference Audio" player.
    tgt_spk.change(fn=change_spk, inputs=[tgt_spk], outputs=[ref_audio])
    submit.click(convert, [tgt_spk, src_audio, f0_shift], [out_audio])

    # Each example row is (target speaker, source audio path, F0 shift).
    examples = gr.Examples(
        examples=[
            ["p225", 'dataset/audio/p226/p226_341.wav', 0],
            ["p226", 'dataset/audio/p225/p225_220.wav', -5],
        ],
        inputs=[tgt_spk, src_audio, f0_shift],
    )

demo.launch()
config_v1_16k.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "F0_path": "Utils/JDC/bst.t7",
3
+
4
+ "use_aug": true,
5
+
6
+ "resblock": "1",
7
+ "num_gpus": 1,
8
+ "batch_size": 16,
9
+ "learning_rate": 0.0002,
10
+ "adam_b1": 0.8,
11
+ "adam_b2": 0.99,
12
+ "lr_decay": 0.999,
13
+ "seed": 1234,
14
+
15
+ "upsample_rates": [10,8],
16
+ "upsample_kernel_sizes": [20,16],
17
+ "upsample_initial_channel": 512,
18
+ "resblock_kernel_sizes": [3,7,11],
19
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
20
+ "gen_istft_n_fft": 16,
21
+ "gen_istft_hop_size": 4,
22
+
23
+ "segment_size": 16000,
24
+ "num_mels": 80,
25
+ "n_fft": 1024,
26
+ "hop_size": 320,
27
+ "win_size": 1024,
28
+
29
+ "sampling_rate": 16000,
30
+
31
+ "fmin": 0,
32
+ "fmax": 8000,
33
+ "fmax_for_loss": null,
34
+
35
+ "num_workers": 8,
36
+
37
+ "dist_config": {
38
+ "dist_backend": "nccl",
39
+ "dist_url": "tcp://localhost:54321",
40
+ "world_size": 1
41
+ }
42
+ }
dataset/audio/p225/p225_220.wav ADDED
Binary file (101 kB). View file
 
dataset/audio/p226/p226_341.wav ADDED
Binary file (93.3 kB). View file
 
dataset/audio/p227/p227_021.wav ADDED
Binary file (294 kB). View file
 
dataset/audio/p228/p228_242.wav ADDED
Binary file (87.1 kB). View file
 
dataset/audio/p229/p229_021.wav ADDED
Binary file (239 kB). View file
 
dataset/audio/p230/p230_361.wav ADDED
Binary file (132 kB). View file
 
dataset/audio/p231/p231_197.wav ADDED
Binary file (36.9 kB). View file
 
dataset/audio/p232/p232_023.wav ADDED
Binary file (285 kB). View file
 
dataset/audio/p233/p233_323.wav ADDED
Binary file (133 kB). View file
 
dataset/audio/p234/p234_229.wav ADDED
Binary file (73.8 kB). View file
 
dataset/audio/p236/p236_068.wav ADDED
Binary file (89.2 kB). View file
 
dataset/audio/p237/p237_023.wav ADDED
Binary file (272 kB). View file
 
dataset/audio/p238/p238_023.wav ADDED
Binary file (372 kB). View file
 
dataset/audio/p239/p239_023.wav ADDED
Binary file (265 kB). View file
 
dataset/audio/p240/p240_004.wav ADDED
Binary file (119 kB). View file
 
dataset/audio/p241/p241_050.wav ADDED
Binary file (64.6 kB). View file
 
dataset/audio/p243/p243_087.wav ADDED
Binary file (109 kB). View file
 
dataset/audio/p244/p244_008.wav ADDED
Binary file (225 kB). View file
 
dataset/audio/p245/p245_014.wav ADDED
Binary file (154 kB). View file
 
dataset/audio/p246/p246_022.wav ADDED
Binary file (196 kB). View file
 
dataset/audio/p247/p247_380.wav ADDED
Binary file (92.2 kB). View file
 
dataset/audio/p248/p248_023.wav ADDED
Binary file (396 kB). View file
 
dataset/audio/p249/p249_223.wav ADDED
Binary file (116 kB). View file
 
dataset/audio/p250/p250_021.wav ADDED
Binary file (225 kB). View file
 
dataset/audio/p251/p251_364.wav ADDED
Binary file (128 kB). View file
 
dataset/audio/p252/p252_023.wav ADDED
Binary file (324 kB). View file
 
dataset/audio/p253/p253_207.wav ADDED
Binary file (101 kB). View file
 
dataset/audio/p254/p254_023.wav ADDED
Binary file (286 kB). View file
 
dataset/audio/p255/p255_038.wav ADDED
Binary file (114 kB). View file
 
dataset/audio/p256/p256_079.wav ADDED
Binary file (119 kB). View file
 
dataset/audio/p257/p257_023.wav ADDED
Binary file (242 kB). View file
 
dataset/audio/p258/p258_228.wav ADDED
Binary file (89.2 kB). View file
 
dataset/audio/p259/p259_011.wav ADDED
Binary file (191 kB). View file
 
dataset/audio/p260/p260_103.wav ADDED
Binary file (121 kB). View file
 
dataset/audio/p261/p261_023.wav ADDED
Binary file (286 kB). View file
 
dataset/audio/p262/p262_210.wav ADDED
Binary file (118 kB). View file
 
dataset/audio/p263/p263_218.wav ADDED
Binary file (101 kB). View file
 
dataset/audio/p264/p264_438.wav ADDED
Binary file (125 kB). View file
 
dataset/audio/p265/p265_273.wav ADDED
Binary file (119 kB). View file
 
dataset/audio/p266/p266_417.wav ADDED
Binary file (89.2 kB). View file
 
dataset/audio/p267/p267_022.wav ADDED
Binary file (176 kB). View file
 
dataset/audio/p268/p268_021.wav ADDED
Binary file (269 kB). View file
 
dataset/audio/p269/p269_332.wav ADDED
Binary file (92.2 kB). View file
 
dataset/audio/p270/p270_297.wav ADDED
Binary file (96.3 kB). View file
 
dataset/audio/p271/p271_170.wav ADDED
Binary file (151 kB). View file