BoldActionMan committed on
Commit
db31449
1 Parent(s): cfee63d

Added main files

Browse files
Files changed (50) hide show
  1. LICENSE +7 -0
  2. checkpoints_v2/base_speakers/ses/en-au.pth +3 -0
  3. checkpoints_v2/base_speakers/ses/en-br.pth +3 -0
  4. checkpoints_v2/base_speakers/ses/en-default.pth +3 -0
  5. checkpoints_v2/base_speakers/ses/en-india.pth +3 -0
  6. checkpoints_v2/base_speakers/ses/en-newest.pth +3 -0
  7. checkpoints_v2/base_speakers/ses/en-us.pth +3 -0
  8. checkpoints_v2/base_speakers/ses/es.pth +3 -0
  9. checkpoints_v2/base_speakers/ses/fr.pth +3 -0
  10. checkpoints_v2/base_speakers/ses/jp.pth +3 -0
  11. checkpoints_v2/base_speakers/ses/kr.pth +3 -0
  12. checkpoints_v2/base_speakers/ses/zh.pth +3 -0
  13. checkpoints_v2/converter/checkpoint.pth +3 -0
  14. checkpoints_v2/converter/config.json +57 -0
  15. environment.yml +288 -0
  16. openvoice/__init__.py +0 -0
  17. openvoice/__pycache__/__init__.cpython-310.pyc +0 -0
  18. openvoice/__pycache__/api.cpython-310.pyc +0 -0
  19. openvoice/__pycache__/attentions.cpython-310.pyc +0 -0
  20. openvoice/__pycache__/commons.cpython-310.pyc +0 -0
  21. openvoice/__pycache__/mel_processing.cpython-310.pyc +0 -0
  22. openvoice/__pycache__/models.cpython-310.pyc +0 -0
  23. openvoice/__pycache__/modules.cpython-310.pyc +0 -0
  24. openvoice/__pycache__/se_extractor.cpython-310.pyc +0 -0
  25. openvoice/__pycache__/transforms.cpython-310.pyc +0 -0
  26. openvoice/__pycache__/utils.cpython-310.pyc +0 -0
  27. openvoice/api.py +202 -0
  28. openvoice/attentions.py +465 -0
  29. openvoice/commons.py +160 -0
  30. openvoice/mel_processing.py +183 -0
  31. openvoice/models.py +499 -0
  32. openvoice/modules.py +598 -0
  33. openvoice/openvoice_app.py +275 -0
  34. openvoice/se_extractor.py +156 -0
  35. openvoice/text/__init__.py +79 -0
  36. openvoice/text/__pycache__/__init__.cpython-310.pyc +0 -0
  37. openvoice/text/__pycache__/cleaners.cpython-310.pyc +0 -0
  38. openvoice/text/__pycache__/english.cpython-310.pyc +0 -0
  39. openvoice/text/__pycache__/mandarin.cpython-310.pyc +0 -0
  40. openvoice/text/__pycache__/symbols.cpython-310.pyc +0 -0
  41. openvoice/text/cleaners.py +16 -0
  42. openvoice/text/english.py +188 -0
  43. openvoice/text/mandarin.py +326 -0
  44. openvoice/text/symbols.py +88 -0
  45. openvoice/transforms.py +209 -0
  46. openvoice/utils.py +194 -0
  47. requirements.txt +286 -0
  48. setup.py +45 -0
  49. videosource.py +202 -0
  50. videotranslator.py +223 -0
LICENSE ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Copyright 2024 MyShell.ai
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
checkpoints_v2/base_speakers/ses/en-au.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e9782233deef51fc5289d05ad4dd4ce12b196e282eccf6b6db6256bbd02daaa
3
+ size 1701
checkpoints_v2/base_speakers/ses/en-br.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2bf5a88025cfd10473b25d65d5c0e608338ce4533059c5f9a3383e69c812d389
3
+ size 1701
checkpoints_v2/base_speakers/ses/en-default.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4139de3bc2ea162f45a5a5f9559b710686c9689749b5ab8945ee5e2a082d154
3
+ size 1783
checkpoints_v2/base_speakers/ses/en-india.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad03d946757e95fe9e13239aa4b11071d98f22316f604f34b1a0b4bdf41cda48
3
+ size 1701
checkpoints_v2/base_speakers/ses/en-newest.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a3798229b1114f0e9cc137b33211809def7dda5a8a9398d5a112c0b42699177
3
+ size 1692
checkpoints_v2/base_speakers/ses/en-us.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d092d4af0815a4bfbc6105b65621ab68dc4c61b2f55044d8a66968a34947c32
3
+ size 1701
checkpoints_v2/base_speakers/ses/es.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8cece8853fb75b9f5217a1f5cda9807bac92a3e4c4547fc651e404d05deff63
3
+ size 1692
checkpoints_v2/base_speakers/ses/fr.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a01f6d30a73efa368c288a542a522a2bcdd4e2ec5589d8646b307cf8e2ad9ae
3
+ size 1692
checkpoints_v2/base_speakers/ses/jp.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b645ff428de4a57a22122318968f1e6127ac81fda2e2aa66062deccd3864416
3
+ size 1692
checkpoints_v2/base_speakers/ses/kr.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f501479d6072741a396725bec79144653e9f4a5381b85901e29683aa169795df
3
+ size 1692
checkpoints_v2/base_speakers/ses/zh.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b353de562700c13faacf096ecfc0adcafd26e6704a9feef572be1279714e031
3
+ size 1692
checkpoints_v2/converter/checkpoint.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9652c27e92b6b2a91632590ac9962ef7ae2b712e5c5b7f4c34ec55ee2b37ab9e
3
+ size 131320490
checkpoints_v2/converter/config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_version_": "v2",
3
+ "data": {
4
+ "sampling_rate": 22050,
5
+ "filter_length": 1024,
6
+ "hop_length": 256,
7
+ "win_length": 1024,
8
+ "n_speakers": 0
9
+ },
10
+ "model": {
11
+ "zero_g": true,
12
+ "inter_channels": 192,
13
+ "hidden_channels": 192,
14
+ "filter_channels": 768,
15
+ "n_heads": 2,
16
+ "n_layers": 6,
17
+ "kernel_size": 3,
18
+ "p_dropout": 0.1,
19
+ "resblock": "1",
20
+ "resblock_kernel_sizes": [
21
+ 3,
22
+ 7,
23
+ 11
24
+ ],
25
+ "resblock_dilation_sizes": [
26
+ [
27
+ 1,
28
+ 3,
29
+ 5
30
+ ],
31
+ [
32
+ 1,
33
+ 3,
34
+ 5
35
+ ],
36
+ [
37
+ 1,
38
+ 3,
39
+ 5
40
+ ]
41
+ ],
42
+ "upsample_rates": [
43
+ 8,
44
+ 8,
45
+ 2,
46
+ 2
47
+ ],
48
+ "upsample_initial_channel": 512,
49
+ "upsample_kernel_sizes": [
50
+ 16,
51
+ 16,
52
+ 4,
53
+ 4
54
+ ],
55
+ "gin_channels": 256
56
+ }
57
+ }
environment.yml ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: openvoice1
2
+ channels:
3
+ - defaults
4
+ dependencies:
5
+ - _libgcc_mutex=0.1=main
6
+ - _openmp_mutex=5.1=1_gnu
7
+ - bzip2=1.0.8=h5eee18b_6
8
+ - ca-certificates=2024.7.2=h06a4308_0
9
+ - ld_impl_linux-64=2.38=h1181459_1
10
+ - libffi=3.4.4=h6a678d5_1
11
+ - libgcc-ng=11.2.0=h1234567_1
12
+ - libgomp=11.2.0=h1234567_1
13
+ - libstdcxx-ng=11.2.0=h1234567_1
14
+ - libuuid=1.41.5=h5eee18b_0
15
+ - ncurses=6.4=h6a678d5_0
16
+ - openssl=3.0.14=h5eee18b_0
17
+ - pip=24.0=py310h06a4308_0
18
+ - python=3.10.14=h955ad1f_1
19
+ - readline=8.2=h5eee18b_0
20
+ - setuptools=69.5.1=py310h06a4308_0
21
+ - sqlite=3.45.3=h5eee18b_0
22
+ - tk=8.6.14=h39e8969_0
23
+ - wheel=0.43.0=py310h06a4308_0
24
+ - xz=5.4.6=h5eee18b_1
25
+ - zlib=1.2.13=h5eee18b_1
26
+ - pip:
27
+ - absl-py==2.1.0
28
+ - aiofiles==23.2.1
29
+ - altair==5.3.0
30
+ - annotated-types==0.7.0
31
+ - anyascii==0.3.2
32
+ - anyio==4.4.0
33
+ - appdirs==1.4.4
34
+ - asttokens==2.4.1
35
+ - attrs==23.2.0
36
+ - audioread==3.0.1
37
+ - av==10.0.0
38
+ - babel==2.15.0
39
+ - backcall==0.2.0
40
+ - beautifulsoup4==4.12.3
41
+ - bert-extractive-summarizer==0.10.1
42
+ - bleach==6.1.0
43
+ - blis==0.7.11
44
+ - boto3==1.34.143
45
+ - botocore==1.34.143
46
+ - cached-path==1.6.3
47
+ - cachetools==5.3.3
48
+ - catalogue==2.0.10
49
+ - certifi==2024.7.4
50
+ - cffi==1.16.0
51
+ - charset-normalizer==3.3.2
52
+ - click==8.1.7
53
+ - cloudpathlib==0.18.1
54
+ - cn2an==0.5.22
55
+ - coloredlogs==15.0.1
56
+ - confection==0.1.5
57
+ - contourpy==1.2.1
58
+ - cryptography==42.0.8
59
+ - ctranslate2==3.24.0
60
+ - cycler==0.12.1
61
+ - cymem==2.0.8
62
+ - cython==3.0.10
63
+ - dateparser==1.1.8
64
+ - decorator==4.4.2
65
+ - deepfilterlib==0.5.6
66
+ - deepfilternet==0.5.6
67
+ - defusedxml==0.7.1
68
+ - deprecated==1.2.14
69
+ - dill==0.3.8
70
+ - distance==0.1.3
71
+ - dnspython==2.6.1
72
+ - docopt==0.6.2
73
+ - dtw-python==1.4.4
74
+ - email-validator==2.2.0
75
+ - eng-to-ipa==0.0.2
76
+ - exceptiongroup==1.2.1
77
+ - executing==2.0.1
78
+ - fastapi==0.111.0
79
+ - fastapi-cli==0.0.4
80
+ - faster-whisper==0.9.0
81
+ - fastjsonschema==2.20.0
82
+ - ffmpeg-python==0.2.0
83
+ - ffmpy==0.3.2
84
+ - filelock==3.13.4
85
+ - flatbuffers==24.3.25
86
+ - fonttools==4.53.1
87
+ - fsspec==2024.6.1
88
+ - fugashi==1.3.0
89
+ - future==1.0.0
90
+ - g2p-en==2.1.0
91
+ - g2pkk==0.1.2
92
+ - google-api-core==2.19.1
93
+ - google-auth==2.32.0
94
+ - google-cloud-core==2.4.1
95
+ - google-cloud-storage==2.17.0
96
+ - google-crc32c==1.5.0
97
+ - google-resumable-media==2.7.1
98
+ - googleapis-common-protos==1.63.2
99
+ - gradio==4.38.1
100
+ - gradio-client==1.1.0
101
+ - grpcio==1.64.1
102
+ - gruut==2.2.3
103
+ - gruut-ipa==0.13.0
104
+ - gruut-lang-de==2.0.1
105
+ - gruut-lang-en==2.0.1
106
+ - gruut-lang-es==2.0.1
107
+ - gruut-lang-fr==2.0.2
108
+ - h11==0.14.0
109
+ - httpcore==1.0.5
110
+ - httptools==0.6.1
111
+ - httpx==0.27.0
112
+ - huggingface-hub==0.23.4
113
+ - humanfriendly==10.0
114
+ - idna==3.7
115
+ - imageio==2.34.2
116
+ - imageio-ffmpeg==0.5.1
117
+ - importlib-resources==6.4.0
118
+ - inflect==7.0.0
119
+ - ipython==8.12.3
120
+ - jaconv==0.3.4
121
+ - jamo==0.4.1
122
+ - jedi==0.19.1
123
+ - jieba==0.42.1
124
+ - jinja2==3.1.4
125
+ - jmespath==1.0.1
126
+ - joblib==1.4.2
127
+ - jsonlines==1.2.0
128
+ - jsonschema==4.23.0
129
+ - jsonschema-specifications==2023.12.1
130
+ - jupyter-client==8.6.2
131
+ - jupyter-core==5.7.2
132
+ - jupyterlab-pygments==0.3.0
133
+ - kiwisolver==1.4.5
134
+ - langcodes==3.4.0
135
+ - langid==1.1.6
136
+ - language-data==1.2.0
137
+ - libretranslatepy==2.1.1
138
+ - librosa==0.9.1
139
+ - llvmlite==0.43.0
140
+ - loguru==0.7.2
141
+ - lxml==5.2.2
142
+ - marisa-trie==1.2.0
143
+ - markdown==3.6
144
+ - markdown-it-py==3.0.0
145
+ - markupsafe==2.1.5
146
+ - matplotlib==3.8.4
147
+ - matplotlib-inline==0.1.7
148
+ - mdurl==0.1.2
149
+ - mecab-python3==1.0.5
150
+ - melotts==0.1.2
151
+ - mistune==3.0.2
152
+ - more-itertools==10.3.0
153
+ - moviepy==1.0.3
154
+ - mpmath==1.3.0
155
+ - multiprocess==0.70.16
156
+ - murmurhash==1.0.10
157
+ - nbclient==0.10.0
158
+ - nbconvert==7.16.4
159
+ - nbformat==5.10.4
160
+ - networkx==2.8.8
161
+ - nltk==3.8.1
162
+ - noisereduce==3.0.2
163
+ - num2words==0.5.12
164
+ - numba==0.60.0
165
+ - numpy==1.22.0
166
+ - nvidia-cublas-cu12==12.1.3.1
167
+ - nvidia-cuda-cupti-cu12==12.1.105
168
+ - nvidia-cuda-nvrtc-cu12==12.1.105
169
+ - nvidia-cuda-runtime-cu12==12.1.105
170
+ - nvidia-cudnn-cu12==8.9.2.26
171
+ - nvidia-cufft-cu12==11.0.2.54
172
+ - nvidia-curand-cu12==10.3.2.106
173
+ - nvidia-cusolver-cu12==11.4.5.107
174
+ - nvidia-cusparse-cu12==12.1.0.106
175
+ - nvidia-nccl-cu12==2.20.5
176
+ - nvidia-nvjitlink-cu12==12.5.82
177
+ - nvidia-nvtx-cu12==12.1.105
178
+ - onnxruntime==1.18.1
179
+ - openai-whisper==20231117
180
+ - orjson==3.10.6
181
+ - packaging==23.2
182
+ - pandas==2.0.3
183
+ - pandocfilters==1.5.1
184
+ - parso==0.8.4
185
+ - pathos==0.3.2
186
+ - pexpect==4.9.0
187
+ - pickleshare==0.7.5
188
+ - pillow==10.4.0
189
+ - pipreqs==0.5.0
190
+ - plac==1.4.3
191
+ - platformdirs==4.2.2
192
+ - pooch==1.8.2
193
+ - pox==0.3.4
194
+ - ppft==1.7.6.8
195
+ - preshed==3.0.9
196
+ - proces==0.1.7
197
+ - proglog==0.1.10
198
+ - prompt-toolkit==3.0.47
199
+ - proto-plus==1.24.0
200
+ - protobuf==5.27.2
201
+ - ptyprocess==0.7.0
202
+ - pure-eval==0.2.2
203
+ - pyasn1==0.6.0
204
+ - pyasn1-modules==0.4.0
205
+ - pycparser==2.22
206
+ - pydantic==2.8.2
207
+ - pydantic-core==2.20.1
208
+ - pydub==0.25.1
209
+ - pyexecjs==1.5.1
210
+ - pygments==2.18.0
211
+ - pykakasi==2.2.1
212
+ - pyparsing==3.1.2
213
+ - pypinyin==0.50.0
214
+ - python-crfsuite==0.9.10
215
+ - python-dateutil==2.9.0.post0
216
+ - python-dotenv==1.0.1
217
+ - python-mecab-ko==1.3.7
218
+ - python-mecab-ko-dic==2.1.1.post2
219
+ - python-multipart==0.0.9
220
+ - pytz==2024.1
221
+ - pyyaml==6.0.1
222
+ - pyzmq==26.0.3
223
+ - referencing==0.35.1
224
+ - regex==2024.5.15
225
+ - requests==2.32.3
226
+ - resampy==0.4.3
227
+ - rich==13.7.1
228
+ - rpds-py==0.19.0
229
+ - rsa==4.9
230
+ - ruff==0.5.2
231
+ - s3transfer==0.10.2
232
+ - scikit-learn==1.5.1
233
+ - scipy==1.11.4
234
+ - semantic-version==2.10.0
235
+ - shellingham==1.5.4
236
+ - six==1.16.0
237
+ - smart-open==7.0.4
238
+ - sniffio==1.3.1
239
+ - soundfile==0.12.1
240
+ - soupsieve==2.5
241
+ - spacy==3.7.5
242
+ - spacy-legacy==3.0.12
243
+ - spacy-loggers==1.0.5
244
+ - srsly==2.4.8
245
+ - stack-data==0.6.3
246
+ - starlette==0.37.2
247
+ - sympy==1.13.0
248
+ - tensorboard==2.16.2
249
+ - tensorboard-data-server==0.7.2
250
+ - thinc==8.2.5
251
+ - threadpoolctl==3.5.0
252
+ - tiktoken==0.7.0
253
+ - tinycss2==1.3.0
254
+ - tokenizers==0.13.3
255
+ - tomlkit==0.12.0
256
+ - toolz==0.12.1
257
+ - torch==2.3.1
258
+ - torchaudio==2.3.1+cpu
259
+ - tornado==6.4.1
260
+ - tqdm==4.66.4
261
+ - traitlets==5.14.3
262
+ - transformers==4.27.4
263
+ - translators==5.9.2
264
+ - triton==2.3.1
265
+ - txtsplit==1.0.0
266
+ - typer==0.12.3
267
+ - typing-extensions==4.12.2
268
+ - tzdata==2024.1
269
+ - tzlocal==5.2
270
+ - ujson==5.10.0
271
+ - unidecode==1.3.7
272
+ - unidic==1.1.0
273
+ - unidic-lite==1.0.8
274
+ - urllib3==2.2.2
275
+ - uvicorn==0.30.1
276
+ - uvloop==0.19.0
277
+ - wasabi==0.10.1
278
+ - watchfiles==0.22.0
279
+ - wavmark==0.0.3
280
+ - wcwidth==0.2.13
281
+ - weasel==0.4.1
282
+ - webencodings==0.5.1
283
+ - websockets==11.0.3
284
+ - werkzeug==3.0.3
285
+ - whisper-timestamped==1.14.2
286
+ - wrapt==1.16.0
287
+ - yarg==0.1.9
288
+ prefix: /home/wty/anaconda3/envs/openvoice1
openvoice/__init__.py ADDED
File without changes
openvoice/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (134 Bytes). View file
 
openvoice/__pycache__/api.cpython-310.pyc ADDED
Binary file (7.3 kB). View file
 
openvoice/__pycache__/attentions.cpython-310.pyc ADDED
Binary file (11.1 kB). View file
 
openvoice/__pycache__/commons.cpython-310.pyc ADDED
Binary file (5.69 kB). View file
 
openvoice/__pycache__/mel_processing.cpython-310.pyc ADDED
Binary file (4.13 kB). View file
 
openvoice/__pycache__/models.cpython-310.pyc ADDED
Binary file (12.7 kB). View file
 
openvoice/__pycache__/modules.cpython-310.pyc ADDED
Binary file (12.6 kB). View file
 
openvoice/__pycache__/se_extractor.cpython-310.pyc ADDED
Binary file (4.21 kB). View file
 
openvoice/__pycache__/transforms.cpython-310.pyc ADDED
Binary file (3.9 kB). View file
 
openvoice/__pycache__/utils.cpython-310.pyc ADDED
Binary file (6.18 kB). View file
 
openvoice/api.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ import re
4
+ import soundfile
5
+ from openvoice import utils
6
+ from openvoice import commons
7
+ import os
8
+ import librosa
9
+ from openvoice.text import text_to_sequence
10
+ from openvoice.mel_processing import spectrogram_torch
11
+ from openvoice.models import SynthesizerTrn
12
+
13
+
14
class OpenVoiceBaseClass(object):
    """Shared model holder for the OpenVoice inference classes.

    Reads hyper-parameters from a config file, builds a ``SynthesizerTrn``
    from them, moves it to the requested device and switches it to eval mode.
    The results are exposed as ``self.model``, ``self.hps`` and
    ``self.device``.
    """

    def __init__(self,
                 config_path,
                 device='cuda:0'):
        """Build the synthesizer described by ``config_path`` on ``device``.

        Args:
            config_path: Path handed to ``utils.get_hparams_from_file``.
            device: Target device, either a string such as ``'cuda:0'`` or
                ``'cpu'``, or a ``torch.device``.

        Raises:
            AssertionError: If a CUDA device is requested but CUDA is not
                available.
        """
        # str() lets callers pass a torch.device as well as a plain string;
        # a bare `'cuda' in device` raises TypeError for torch.device.
        if 'cuda' in str(device) and not torch.cuda.is_available():
            # Raised explicitly instead of via `assert` so the check is not
            # stripped under `python -O`; the exception type is unchanged.
            raise AssertionError(
                f"device {device!r} was requested but CUDA is not available")

        hps = utils.get_hparams_from_file(config_path)

        model = SynthesizerTrn(
            # Number of text symbols; 0 when the config defines none.
            len(getattr(hps, 'symbols', [])),
            # presumably the number of linear-spectrogram bins -- TODO confirm
            hps.data.filter_length // 2 + 1,
            n_speakers=hps.data.n_speakers,
            **hps.model,
        ).to(device)

        model.eval()
        self.model = model
        self.hps = hps
        self.device = device

    def load_ckpt(self, ckpt_path):
        """Load the ``'model'`` state dict stored at ``ckpt_path`` into ``self.model``.

        Loading is non-strict: missing and unexpected keys are printed
        rather than raised.
        """
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from a trusted source.
        checkpoint_dict = torch.load(ckpt_path, map_location=torch.device(self.device))
        missing_keys, unexpected_keys = self.model.load_state_dict(
            checkpoint_dict['model'], strict=False)
        print("Loaded checkpoint '{}'".format(ckpt_path))
        print('missing/unexpected keys:', missing_keys, unexpected_keys)
40
+
41
+
42
class BaseSpeakerTTS(OpenVoiceBaseClass):
    """Text-to-speech front end built on the shared synthesizer model."""

    # Maps the lower-cased ``language`` argument of ``tts`` to the tag that
    # is wrapped around every sentence before it is converted to symbols.
    language_marks = {
        "english": "EN",
        "chinese": "ZH",
    }

    @staticmethod
    def get_text(text, hps, is_symbol):
        """Convert ``text`` to a ``LongTensor`` of symbol ids.

        When ``is_symbol`` is true no text cleaners are applied; otherwise
        ``hps.data.text_cleaners`` is used.  If ``hps.data.add_blank`` is set,
        a 0 is interspersed between the ids via ``commons.intersperse``.
        """
        cleaners = [] if is_symbol else hps.data.text_cleaners
        text_norm = text_to_sequence(text, hps.symbols, cleaners)
        if hps.data.add_blank:
            text_norm = commons.intersperse(text_norm, 0)
        return torch.LongTensor(text_norm)

    @staticmethod
    def audio_numpy_concat(segment_data_list, sr, speed=1.):
        """Flatten and join audio segments, appending a short silence to each.

        Args:
            segment_data_list: Iterable of array-like audio segments.
            sr: Sampling rate used to size the silence.
            speed: Speed factor; the silence lasts ``0.05 / speed`` seconds.

        Returns:
            One-dimensional ``float32`` numpy array.
        """
        segments = list(segment_data_list)
        if not segments:
            return np.zeros(0, dtype=np.float32)

        # Concatenate arrays directly instead of converting every sample to
        # a Python object via ``tolist()`` and back again.
        gap = np.zeros(max(0, int((sr * 0.05) / speed)), dtype=np.float32)
        pieces = []
        for segment_data in segments:
            pieces.append(np.asarray(segment_data, dtype=np.float32).reshape(-1))
            pieces.append(gap)
        return np.concatenate(pieces)

    @staticmethod
    def split_sentences_into_pieces(text, language_str):
        """Split ``text`` with ``utils.split_sentence`` and print the pieces."""
        texts = utils.split_sentence(text, language_str=language_str)
        print(" > Text splitted to sentences.")
        print('\n'.join(texts))
        print(" > ===========================")
        return texts

    def tts(self, text, output_path, speaker, language='English', speed=1.0):
        """Synthesize ``text`` sentence by sentence.

        Args:
            text: Input text.
            output_path: Destination audio file, or ``None`` to get the
                samples back instead.
            speaker: Key into ``self.hps.speakers``.
            language: Case-insensitive key of ``language_marks``.
            speed: Speed factor; ``1.0 / speed`` is passed to the model as
                ``length_scale``.

        Returns:
            The ``float32`` audio array when ``output_path`` is ``None``;
            otherwise ``None`` after writing the file.

        Raises:
            AssertionError: If ``language`` is not supported.
        """
        mark = self.language_marks.get(language.lower(), None)
        if mark is None:
            # Explicit raise so the check is not stripped under `python -O`;
            # the exception type and message match the former assert.
            raise AssertionError(f"language {language} is not supported")

        texts = self.split_sentences_into_pieces(text, mark)
        device = self.device

        audio_list = []
        for t in texts:
            # Put a space at every lower-case/upper-case boundary.
            t = re.sub(r'([a-z])([A-Z])', r'\1 \2', t)
            t = f'[{mark}]{t}[{mark}]'
            stn_tst = self.get_text(t, self.hps, False)
            speaker_id = self.hps.speakers[speaker]
            with torch.no_grad():
                x_tst = stn_tst.unsqueeze(0).to(device)
                x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
                sid = torch.LongTensor([speaker_id]).to(device)
                audio = self.model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.6,
                                         length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
            audio_list.append(audio)
        audio = self.audio_numpy_concat(audio_list, sr=self.hps.data.sampling_rate, speed=speed)

        if output_path is None:
            return audio
        else:
            soundfile.write(output_path, audio, self.hps.data.sampling_rate)
99
+
100
+
101
class ToneColorConverter(OpenVoiceBaseClass):
    """Voice-conversion wrapper with optional audio watermarking."""

    def __init__(self, *args, enable_watermark=True, **kwargs):
        """Build the converter model and, optionally, the watermark model.

        Args:
            *args, **kwargs: Forwarded to ``OpenVoiceBaseClass.__init__``.
            enable_watermark: When true, load the ``wavmark`` model.  It is
                taken as an explicit keyword so it is NOT forwarded to the
                base initializer, which does not accept it.
        """
        super().__init__(*args, **kwargs)

        if enable_watermark:
            # Imported lazily so wavmark is only needed when watermarking.
            import wavmark
            self.watermark_model = wavmark.load_model().to(self.device)
        else:
            self.watermark_model = None
        self.version = getattr(self.hps, '_version_', "v1")

    def extract_se(self, ref_wav_list, se_save_path=None):
        """Run the reference encoder on each file and average the results.

        Args:
            ref_wav_list: One audio path or a list of audio paths.
            se_save_path: Optional path the averaged tensor is saved to.

        Returns:
            The mean of the per-file ``ref_enc`` outputs.
        """
        if isinstance(ref_wav_list, str):
            ref_wav_list = [ref_wav_list]

        device = self.device
        hps = self.hps
        gs = []

        for fname in ref_wav_list:
            audio_ref, _ = librosa.load(fname, sr=hps.data.sampling_rate)
            y = torch.FloatTensor(audio_ref)
            y = y.to(device)
            y = y.unsqueeze(0)
            y = spectrogram_torch(y, hps.data.filter_length,
                                  hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
                                  center=False).to(device)
            with torch.no_grad():
                g = self.model.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
                gs.append(g.detach())
        gs = torch.stack(gs).mean(0)

        if se_save_path is not None:
            # dirname is '' for a bare filename, and os.makedirs('') raises.
            save_dir = os.path.dirname(se_save_path)
            if save_dir:
                os.makedirs(save_dir, exist_ok=True)
            torch.save(gs.cpu(), se_save_path)

        return gs

    def convert(self, audio_src_path, src_se, tgt_se, output_path=None, tau=0.3, message="default"):
        """Convert the audio at ``audio_src_path`` and watermark the result.

        ``src_se``, ``tgt_se`` and ``tau`` are passed to
        ``self.model.voice_conversion`` as ``sid_src``, ``sid_tgt`` and
        ``tau``; ``message`` is passed to ``add_watermark``.

        Returns:
            The converted audio array when ``output_path`` is ``None``;
            otherwise ``None`` after writing the file.
        """
        hps = self.hps
        # load audio
        audio, _ = librosa.load(audio_src_path, sr=hps.data.sampling_rate)

        with torch.no_grad():
            y = torch.tensor(audio).float().to(self.device).unsqueeze(0)
            spec = spectrogram_torch(y, hps.data.filter_length,
                                     hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
                                     center=False).to(self.device)
            spec_lengths = torch.LongTensor([spec.size(-1)]).to(self.device)
            audio = self.model.voice_conversion(spec, spec_lengths, sid_src=src_se, sid_tgt=tgt_se, tau=tau)[0][
                        0, 0].data.cpu().float().numpy()
            audio = self.add_watermark(audio, message)

        if output_path is None:
            return audio
        else:
            soundfile.write(output_path, audio, hps.data.sampling_rate)

    def add_watermark(self, audio, message):
        """Embed ``message`` into ``audio`` and return ``audio``.

        The bits from ``utils.string_to_bits(message)`` are consumed 32 at a
        time; group ``n`` is encoded into samples ``[2nK, (2n + 1)K)`` with
        ``K = 16000``.  ``audio`` is modified in place.  Returns ``audio``
        unchanged when no watermark model is loaded.
        """
        if self.watermark_model is None:
            return audio
        device = self.device
        bits = utils.string_to_bits(message).reshape(-1)
        n_repeat = len(bits) // 32

        K = 16000
        coeff = 2
        for n in range(n_repeat):
            start = (coeff * n) * K
            stop = (coeff * n + 1) * K
            trunck = audio[start:stop]
            if len(trunck) != K:
                print('Audio too short, fail to add watermark')
                break
            message_npy = bits[n * 32: (n + 1) * 32]

            with torch.no_grad():
                signal = torch.FloatTensor(trunck).to(device)[None]
                message_tensor = torch.FloatTensor(message_npy).to(device)[None]
                signal_wmd_tensor = self.watermark_model.encode(signal, message_tensor)
                signal_wmd_npy = signal_wmd_tensor.detach().cpu().squeeze().numpy()
            audio[start:stop] = signal_wmd_npy
        return audio

    def detect_watermark(self, audio, n_repeat):
        """Decode ``n_repeat`` watermark groups from ``audio``.

        Reads the same sample windows that ``add_watermark`` writes.

        Returns:
            The result of ``utils.bits_to_string`` on the decoded bits, or
            the string ``'Fail'`` when a window is shorter than ``K``.
        """
        bits = []
        K = 16000
        coeff = 2
        for n in range(n_repeat):
            trunck = audio[(coeff * n) * K: (coeff * n + 1) * K]
            if len(trunck) != K:
                print('Audio too short, fail to detect watermark')
                return 'Fail'
            with torch.no_grad():
                signal = torch.FloatTensor(trunck).to(self.device).unsqueeze(0)
                message_decoded_npy = (self.watermark_model.decode(signal) >= 0.5).int().detach().cpu().numpy().squeeze()
            bits.append(message_decoded_npy)
        bits = np.stack(bits).reshape(-1, 8)
        message = utils.bits_to_string(bits)
        return message
202
+
openvoice/attentions.py ADDED
@@ -0,0 +1,465 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import functional as F
5
+
6
+ from openvoice import commons
7
+ import logging
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class LayerNorm(nn.Module):
    """Layer normalization over dimension 1 of the input tensor."""

    def __init__(self, channels, eps=1e-5):
        """Create per-channel scale (``gamma``) and shift (``beta``) parameters."""
        super().__init__()
        self.channels = channels
        self.eps = eps

        # Scale starts at one and shift at zero.
        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        """Normalize ``x`` along dimension 1 and return a tensor of the same layout."""
        # F.layer_norm works on trailing dimensions, so swap dimension 1 to
        # the end, normalize, then swap it back.
        channels_last = x.transpose(1, -1)
        normalized = F.layer_norm(
            channels_last, (self.channels,), self.gamma, self.beta, self.eps
        )
        return normalized.transpose(1, -1)
25
+
26
+
27
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    # Gated activation: add the two inputs, then multiply tanh of the first
    # `n_channels[0]` channels (dimension 1) by sigmoid of the remaining ones.
    split = n_channels[0]
    summed = input_a + input_b
    tanh_part = torch.tanh(summed[:, :split, :])
    sigmoid_part = torch.sigmoid(summed[:, split:, :])
    return tanh_part * sigmoid_part
35
+
36
+
37
class Encoder(nn.Module):
    """Stack of self-attention + feed-forward layers with optional conditioning.

    When a non-zero ``gin_channels`` keyword is given, a linear projection of
    the conditioning input ``g`` is added to the hidden state just before the
    layer at index ``cond_layer_idx``.
    """

    def __init__(
        self,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size=1,
        p_dropout=0.0,
        window_size=4,
        isflow=True,
        **kwargs
    ):
        """Build ``n_layers`` attention/FFN layer pairs.

        Args:
            hidden_channels: Channel count of the input and of every layer.
            filter_channels: Passed to each ``FFN`` as its third argument.
            n_heads: Number of attention heads.
            n_layers: Number of attention/FFN layer pairs.
            kernel_size: Passed to each ``FFN``.
            p_dropout: Dropout probability.
            window_size: Passed to each ``MultiHeadAttention``.
            isflow: Accepted for interface compatibility; not used.
            **kwargs: ``gin_channels`` (input size of the conditioning
                projection) and ``cond_layer_idx`` (default 2) are read;
                anything else is ignored.

        Raises:
            AssertionError: If conditioning is enabled and ``cond_layer_idx``
                is not smaller than ``n_layers``.
        """
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.window_size = window_size

        # cond_layer_idx == n_layers means "never condition": forward() only
        # compares it against layer indices 0 .. n_layers - 1.
        self.cond_layer_idx = self.n_layers
        # Always define the attribute (0 = conditioning disabled) so reading
        # it never depends on which keywords the caller happened to pass.
        self.gin_channels = kwargs.get("gin_channels", 0)
        if self.gin_channels != 0:
            self.spk_emb_linear = nn.Linear(self.gin_channels, self.hidden_channels)
            # vits2 says 3rd block, so idx is 2 by default
            self.cond_layer_idx = kwargs.get("cond_layer_idx", 2)
            if not self.cond_layer_idx < self.n_layers:
                # Explicit raise so the check is not stripped under
                # `python -O`; type and message match the former assert.
                raise AssertionError("cond_layer_idx should be less than n_layers")

        self.drop = nn.Dropout(p_dropout)
        self.attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()

        for _ in range(self.n_layers):
            self.attn_layers.append(
                MultiHeadAttention(
                    hidden_channels,
                    hidden_channels,
                    n_heads,
                    p_dropout=p_dropout,
                    window_size=window_size,
                )
            )
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            self.ffn_layers.append(
                FFN(
                    hidden_channels,
                    hidden_channels,
                    filter_channels,
                    kernel_size,
                    p_dropout=p_dropout,
                )
            )
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask, g=None):
        """Run the layer stack on ``x``.

        Args:
            x: Input tensor.
            x_mask: Mask multiplied into ``x`` and used to build the
                attention mask.
            g: Optional conditioning tensor; only used at layer
                ``cond_layer_idx``.

        Returns:
            The masked output of the last layer.
        """
        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        layers = zip(
            self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2
        )
        for i, (attn, norm_1, ffn, norm_2) in enumerate(layers):
            if i == self.cond_layer_idx and g is not None:
                # Project g to hidden_channels (the linear layer acts on the
                # last dimension, hence the transposes) and add it in.
                g = self.spk_emb_linear(g.transpose(1, 2))
                g = g.transpose(1, 2)
                x = x + g
                x = x * x_mask
            y = attn(x, x, attn_mask)
            y = self.drop(y)
            x = norm_1(x + y)

            y = ffn(x, x_mask)
            y = self.drop(y)
            x = norm_2(x + y)
        x = x * x_mask
        return x
122
+
123
+
124
class Decoder(nn.Module):
    """Stack of self-attention, encoder-decoder attention and FFN layers."""

    def __init__(
        self,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size=1,
        p_dropout=0.0,
        proximal_bias=False,
        proximal_init=True,
        **kwargs
    ):
        """Build ``n_layers`` decoder layers; extra keyword arguments are ignored."""
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init

        self.drop = nn.Dropout(p_dropout)
        self.self_attn_layers = nn.ModuleList()
        self.norm_layers_0 = nn.ModuleList()
        self.encdec_attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()

        for _ in range(self.n_layers):
            # Sub-modules are created in the same order as they are applied
            # in forward(): self-attention, cross-attention, feed-forward.
            self_attn = MultiHeadAttention(
                hidden_channels,
                hidden_channels,
                n_heads,
                p_dropout=p_dropout,
                proximal_bias=proximal_bias,
                proximal_init=proximal_init,
            )
            self.self_attn_layers.append(self_attn)
            self.norm_layers_0.append(LayerNorm(hidden_channels))

            cross_attn = MultiHeadAttention(
                hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
            )
            self.encdec_attn_layers.append(cross_attn)
            self.norm_layers_1.append(LayerNorm(hidden_channels))

            feed_forward = FFN(
                hidden_channels,
                hidden_channels,
                filter_channels,
                kernel_size,
                p_dropout=p_dropout,
                causal=True,
            )
            self.ffn_layers.append(feed_forward)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask, h, h_mask):
        """
        x: decoder input
        h: encoder output
        """
        # Mask for decoder self-attention, built by commons.subsequent_mask
        # (presumably a causal mask -- confirm in commons).
        self_mask = commons.subsequent_mask(x_mask.size(2)).to(
            device=x.device, dtype=x.dtype
        )
        cross_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)

        x = x * x_mask
        layers = zip(
            self.self_attn_layers,
            self.norm_layers_0,
            self.encdec_attn_layers,
            self.norm_layers_1,
            self.ffn_layers,
            self.norm_layers_2,
        )
        for self_attn, norm_0, cross_attn, norm_1, ffn, norm_2 in layers:
            # Each sub-layer: residual connection around dropout(sub-layer), then norm.
            x = norm_0(x + self.drop(self_attn(x, x, self_mask)))
            x = norm_1(x + self.drop(cross_attn(x, h, cross_mask)))
            x = norm_2(x + self.drop(ffn(x, x_mask)))
        return x * x_mask
208
+
209
+
210
class MultiHeadAttention(nn.Module):
    """Multi-head attention implemented with 1x1 convolutions.

    Inputs are laid out as [b, d, t]. Optional extras, all restricted to
    self-attention (query and key lengths must match):

    * ``window_size``: learned relative-position embeddings for keys and
      values covering offsets in ``[-window_size, window_size]``.
    * ``proximal_bias``: adds ``-log(1 + |i - j|)`` to the attention logits.
    * ``block_length``: masks logits further than ``block_length`` positions
      from the diagonal.

    The attention weights of the most recent forward pass are stored in
    ``self.attn``.
    """

    def __init__(
        self,
        channels,
        out_channels,
        n_heads,
        p_dropout=0.0,
        window_size=None,
        heads_share=True,
        block_length=None,
        proximal_bias=False,
        proximal_init=False,
    ):
        super().__init__()
        assert channels % n_heads == 0

        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.p_dropout = p_dropout
        self.window_size = window_size
        self.heads_share = heads_share
        self.block_length = block_length
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init
        self.attn = None

        # Per-head channel count.
        self.k_channels = channels // n_heads
        self.conv_q = nn.Conv1d(channels, channels, 1)
        self.conv_k = nn.Conv1d(channels, channels, 1)
        self.conv_v = nn.Conv1d(channels, channels, 1)
        self.conv_o = nn.Conv1d(channels, out_channels, 1)
        self.drop = nn.Dropout(p_dropout)

        if window_size is not None:
            # One embedding table shared by all heads, or one per head.
            n_heads_rel = 1 if heads_share else n_heads
            rel_stddev = self.k_channels**-0.5
            self.emb_rel_k = nn.Parameter(
                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
                * rel_stddev
            )
            self.emb_rel_v = nn.Parameter(
                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
                * rel_stddev
            )

        nn.init.xavier_uniform_(self.conv_q.weight)
        nn.init.xavier_uniform_(self.conv_k.weight)
        nn.init.xavier_uniform_(self.conv_v.weight)
        if proximal_init:
            # Start the key projection as a copy of the query projection.
            with torch.no_grad():
                self.conv_k.weight.copy_(self.conv_q.weight)
                self.conv_k.bias.copy_(self.conv_q.bias)

    def forward(self, x, c, attn_mask=None):
        """Attend from ``x`` (queries) to ``c`` (keys/values).

        Returns the output of ``conv_o`` and stores the attention weights in
        ``self.attn`` as a side effect.
        """
        q = self.conv_q(x)
        k = self.conv_k(c)
        v = self.conv_v(c)

        x, self.attn = self.attention(q, k, v, mask=attn_mask)

        x = self.conv_o(x)
        return x

    def attention(self, query, key, value, mask=None):
        """Scaled dot-product attention.

        Returns ``(output, p_attn)`` where output is [b, d, t_t] and p_attn is
        [b, n_h, t_t, t_s].
        """
        # reshape [b, d, t] -> [b, n_h, t, d_k]
        # t_s is the key/value length, t_t the query length.
        b, d, t_s, t_t = (*key.size(), query.size(2))
        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)

        scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
        if self.window_size is not None:
            assert (
                t_s == t_t
            ), "Relative attention is only available for self-attention."
            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
            rel_logits = self._matmul_with_relative_keys(
                query / math.sqrt(self.k_channels), key_relative_embeddings
            )
            scores_local = self._relative_position_to_absolute_position(rel_logits)
            scores = scores + scores_local
        if self.proximal_bias:
            assert t_s == t_t, "Proximal bias is only available for self-attention."
            scores = scores + self._attention_bias_proximal(t_s).to(
                device=scores.device, dtype=scores.dtype
            )
        if mask is not None:
            # Masked positions get a large negative logit instead of -inf.
            scores = scores.masked_fill(mask == 0, -1e4)
            if self.block_length is not None:
                assert (
                    t_s == t_t
                ), "Local attention is only available for self-attention."
                # Keep only the band within block_length of the diagonal.
                block_mask = (
                    torch.ones_like(scores)
                    .triu(-self.block_length)
                    .tril(self.block_length)
                )
                scores = scores.masked_fill(block_mask == 0, -1e4)
        p_attn = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
        p_attn = self.drop(p_attn)
        output = torch.matmul(p_attn, value)
        if self.window_size is not None:
            relative_weights = self._absolute_position_to_relative_position(p_attn)
            value_relative_embeddings = self._get_relative_embeddings(
                self.emb_rel_v, t_s
            )
            output = output + self._matmul_with_relative_values(
                relative_weights, value_relative_embeddings
            )
        output = (
            output.transpose(2, 3).contiguous().view(b, d, t_t)
        )  # [b, n_h, t_t, d_k] -> [b, d, t_t]
        return output, p_attn

    def _matmul_with_relative_values(self, x, y):
        """
        x: [b, h, l, m]
        y: [h or 1, m, d]
        ret: [b, h, l, d]
        """
        ret = torch.matmul(x, y.unsqueeze(0))
        return ret

    def _matmul_with_relative_keys(self, x, y):
        """
        x: [b, h, l, d]
        y: [h or 1, m, d]
        ret: [b, h, l, m]
        """
        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
        return ret

    def _get_relative_embeddings(self, relative_embeddings, length):
        """Return the ``2 * length - 1`` embedding rows used for ``length``.

        The table holds ``2 * window_size + 1`` rows. When ``length`` exceeds
        ``window_size + 1`` the table is zero-padded on both sides; otherwise
        a centred slice of the table is taken.
        """
        # Pad first before slice to avoid using cond ops.
        pad_length = max(length - (self.window_size + 1), 0)
        slice_start_position = max((self.window_size + 1) - length, 0)
        slice_end_position = slice_start_position + 2 * length - 1
        if pad_length > 0:
            padded_relative_embeddings = F.pad(
                relative_embeddings,
                commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
            )
        else:
            padded_relative_embeddings = relative_embeddings
        used_relative_embeddings = padded_relative_embeddings[
            :, slice_start_position:slice_end_position
        ]
        return used_relative_embeddings

    def _relative_position_to_absolute_position(self, x):
        """
        x: [b, h, l, 2*l-1]
        ret: [b, h, l, l]
        """
        batch, heads, length, _ = x.size()
        # Concat columns of pad to shift from relative to absolute indexing.
        x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))

        # Concat extra elements so to add up to shape (len+1, 2*len-1).
        x_flat = x.view([batch, heads, length * 2 * length])
        x_flat = F.pad(
            x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
        )

        # Reshape and slice out the padded elements.
        x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
            :, :, :length, length - 1 :
        ]
        return x_final

    def _absolute_position_to_relative_position(self, x):
        """
        x: [b, h, l, l]
        ret: [b, h, l, 2*l-1]
        """
        batch, heads, length, _ = x.size()
        # pad along column
        x = F.pad(
            x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
        )
        x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
        # add 0's in the beginning that will skew the elements after reshape
        x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
        return x_final

    def _attention_bias_proximal(self, length):
        """Bias for self-attention to encourage attention to close positions.
        Args:
          length: an integer scalar.
        Returns:
          a Tensor with shape [1, 1, length, length]
        """
        r = torch.arange(length, dtype=torch.float32)
        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
408
+
409
+
410
class FFN(nn.Module):
    """Two-layer 1-D convolutional feed-forward block with masking.

    Each convolution is preceded by explicit zero padding along the last
    axis: left-only when ``causal`` is true, otherwise split across both
    sides. ``activation == "gelu"`` selects ``x * sigmoid(1.702 * x)``; any
    other value selects ReLU.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        filter_channels,
        kernel_size,
        p_dropout=0.0,
        activation=None,
        causal=False,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.activation = activation
        self.causal = causal

        # Bound method that pads the input before each convolution.
        self.padding = self._causal_padding if causal else self._same_padding

        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        """Apply conv -> activation -> dropout -> conv, masking at each step."""
        hidden = self.conv_1(self.padding(x * x_mask))
        if self.activation == "gelu":
            hidden = hidden * torch.sigmoid(1.702 * hidden)
        else:
            hidden = torch.relu(hidden)
        hidden = self.drop(hidden)
        projected = self.conv_2(self.padding(hidden * x_mask))
        return projected * x_mask

    def _pad_last_axis(self, x, left, right):
        """Zero-pad the last axis of ``x``; a kernel of size 1 needs no padding."""
        if self.kernel_size == 1:
            return x
        # F.pad lists the last axis first; the two leading axes get no padding.
        return F.pad(x, [left, right, 0, 0, 0, 0])

    def _causal_padding(self, x):
        """Pad ``kernel_size - 1`` zeros on the left only."""
        return self._pad_last_axis(x, self.kernel_size - 1, 0)

    def _same_padding(self, x):
        """Pad so that the convolution preserves the input length."""
        return self._pad_last_axis(
            x, (self.kernel_size - 1) // 2, self.kernel_size // 2
        )
openvoice/commons.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ from torch.nn import functional as F
4
+
5
+
6
def init_weights(m, mean=0.0, std=0.01):
    """Re-draw ``m.weight`` from N(mean, std) when the class name contains "Conv".

    Modules whose class name does not contain "Conv" are left untouched.
    """
    if "Conv" in type(m).__name__:
        m.weight.data.normal_(mean, std)
10
+
11
+
12
def get_padding(kernel_size, dilation=1):
    """Return ``int(dilation * (kernel_size - 1) / 2)`` as a padding amount."""
    span = dilation * (kernel_size - 1)
    return int(span / 2)
14
+
15
+
16
def convert_pad_shape(pad_shape):
    """Flatten per-axis ``[before, after]`` pairs into one list, last axis first."""
    flat = []
    for pair in pad_shape[::-1]:
        flat.extend(pair)
    return flat
20
+
21
+
22
def intersperse(lst, item):
    """Return a list with ``item`` before, between and after the elements of ``lst``."""
    count = len(lst)
    woven = [item]
    for position in range(count):
        woven.append(lst[position])
        woven.append(item)
    return woven
26
+
27
+
28
def kl_divergence(m_p, logs_p, m_q, logs_q):
    """KL(P||Q) computed element-wise from means and log standard deviations."""
    divergence = (logs_q - logs_p) - 0.5
    spread = torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)
    # Accumulate in place.
    divergence += 0.5 * spread * torch.exp(-2.0 * logs_q)
    return divergence
35
+
36
+
37
def rand_gumbel(shape):
    """Sample from the Gumbel distribution, protect from overflows."""
    # Squeeze the uniform draw into [0.00001, 0.99999) so neither log blows up.
    u = torch.rand(shape).mul(0.99998).add(0.00001)
    return u.log().neg().log().neg()
41
+
42
+
43
def rand_gumbel_like(x):
    """Draw Gumbel noise with the size, dtype and device of ``x``."""
    return rand_gumbel(x.size()).to(device=x.device, dtype=x.dtype)
46
+
47
+
48
def slice_segments(x, ids_str, segment_size=4):
    """Cut a ``segment_size`` window along the last axis for every batch item.

    ``ids_str[i]`` is the start index of the window taken from ``x[i]``.
    """
    segments = torch.zeros_like(x[:, :, :segment_size])
    for row in range(x.size(0)):
        start = ids_str[row]
        segments[row] = x[row, :, start : start + segment_size]
    return segments
55
+
56
+
57
def rand_slice_segments(x, x_lengths=None, segment_size=4):
    """Cut one randomly placed ``segment_size`` window per batch item.

    Returns ``(segments, start_indices)``. When ``x_lengths`` is omitted the
    full last-axis length of ``x`` bounds the start index.
    """
    batch, _, total = x.size()
    lengths = total if x_lengths is None else x_lengths
    max_start = lengths - segment_size + 1
    starts = (torch.rand([batch]).to(device=x.device) * max_start).to(dtype=torch.long)
    return slice_segments(x, starts, segment_size), starts
65
+
66
+
67
def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
    """Build a sinusoidal timing signal of shape [1, channels, length].

    The first ``channels // 2`` rows are sines and the next ``channels // 2``
    rows are cosines at geometrically spaced timescales between
    ``min_timescale`` and ``max_timescale``. An odd ``channels`` gets one
    trailing row of zeros.
    """
    position = torch.arange(length, dtype=torch.float)
    num_timescales = channels // 2
    # Guard the denominator: with a single timescale (channels of 2 or 3)
    # ``num_timescales - 1`` is zero and the division used to raise.
    log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / max(
        num_timescales - 1, 1
    )
    inv_timescales = min_timescale * torch.exp(
        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
    )
    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
    # Pad one zero row when ``channels`` is odd.
    signal = F.pad(signal, [0, 0, 0, channels % 2])
    signal = signal.view(1, channels, length)
    return signal
81
+
82
+
83
def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
    """Add the sinusoidal timing signal to ``x`` ([b, channels, length])."""
    _, n_channels, n_steps = x.size()
    timing = get_timing_signal_1d(n_steps, n_channels, min_timescale, max_timescale)
    timing = timing.to(dtype=x.dtype, device=x.device)
    return x + timing
87
+
88
+
89
def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
    """Concatenate the sinusoidal timing signal onto ``x`` along ``axis``."""
    _, n_channels, n_steps = x.size()
    timing = get_timing_signal_1d(n_steps, n_channels, min_timescale, max_timescale)
    timing = timing.to(dtype=x.dtype, device=x.device)
    return torch.cat([x, timing], axis)
93
+
94
+
95
def subsequent_mask(length):
    """Return a [1, 1, length, length] lower-triangular mask of ones."""
    lower = torch.ones(length, length).tril()
    return lower[None, None]
98
+
99
+
100
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    """Gated activation: tanh of the first channels times sigmoid of the rest.

    The two inputs are summed, then split along axis 1 at ``n_channels[0]``.
    """
    split = n_channels[0]
    summed = input_a + input_b
    gate_tanh = torch.tanh(summed[:, :split, :])
    gate_sigmoid = torch.sigmoid(summed[:, split:, :])
    return gate_tanh * gate_sigmoid
108
+
109
+
110
def convert_pad_shape(pad_shape):
    """Flatten per-axis ``[before, after]`` pairs into one list, last axis first."""
    return [
        amount
        for axis in range(len(pad_shape) - 1, -1, -1)
        for amount in pad_shape[axis]
    ]
114
+
115
+
116
def shift_1d(x):
    """Shift a 3-D tensor one step right along its last axis, zero-filling the front."""
    # One zero on the left of the last axis, nothing on the other two axes.
    padded = F.pad(x, [1, 0, 0, 0, 0, 0])
    return padded[:, :, :-1]
119
+
120
+
121
def sequence_mask(length, max_length=None):
    """Return a boolean [len(length), max_length] mask, true where index < length.

    ``max_length`` defaults to the largest value in ``length``.
    """
    limit = length.max() if max_length is None else max_length
    positions = torch.arange(limit, dtype=length.dtype, device=length.device)
    return positions[None, :] < length[:, None]
126
+
127
+
128
def generate_path(duration, mask):
    """Expand per-token durations into an alignment path.

    duration: [b, 1, t_x]
    mask: [b, 1, t_y, t_x]

    Returns a tensor shaped like ``mask``.
    """
    batch, _, t_y, t_x = mask.shape

    # Cumulative end position of every token, flattened over the batch.
    ends = torch.cumsum(duration, -1).view(batch * t_x)
    filled = sequence_mask(ends, t_y).to(mask.dtype).view(batch, t_x, t_y)

    # Subtracting the previous token's row leaves only this token's own span.
    previous = F.pad(filled, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
    spans = filled - previous
    return spans.unsqueeze(1).transpose(2, 3) * mask
143
+
144
+
145
def clip_grad_value_(parameters, clip_value, norm_type=2):
    """Clamp gradients element-wise and return their total norm before clamping.

    Args:
        parameters: a tensor or an iterable of tensors; entries whose
            ``grad`` is None are ignored.
        clip_value: gradients are clamped in place to
            ``[-clip_value, clip_value]``; None disables clamping.
        norm_type: order of the norm to report. ``float("inf")`` reports the
            largest per-parameter infinity norm.

    Returns:
        The total gradient norm as a Python float.
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    parameters = [p for p in parameters if p.grad is not None]
    norm_type = float(norm_type)
    if clip_value is not None:
        clip_value = float(clip_value)

    # The power-sum formula breaks down for the infinity norm (x ** inf,
    # then ** 0.0), so that case is reduced with max() instead.
    use_max = norm_type == float("inf")
    total_norm = 0.0
    for p in parameters:
        param_norm = p.grad.data.norm(norm_type).item()
        if use_max:
            total_norm = max(total_norm, param_norm)
        else:
            total_norm += param_norm ** norm_type
        if clip_value is not None:
            p.grad.data.clamp_(min=-clip_value, max=clip_value)
    if not use_max:
        total_norm = total_norm ** (1.0 / norm_type)
    return total_norm
openvoice/mel_processing.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.utils.data
3
+ from librosa.filters import mel as librosa_mel_fn
4
+
5
# 2**15; presumably the full-scale magnitude of 16-bit PCM audio (confirm against callers).
MAX_WAV_VALUE = float(2 ** 15)
6
+
7
+
8
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """Log-compress ``x`` after flooring it at ``clip_val``.

    PARAMS
    ------
    C: compression factor
    clip_val: lower bound applied before the logarithm
    """
    floored = x.clamp(min=clip_val)
    return (floored * C).log()
15
+
16
+
17
def dynamic_range_decompression_torch(x, C=1):
    """Invert the log compression: ``exp(x) / C``.

    PARAMS
    ------
    C: compression factor used to compress
    """
    expanded = x.exp()
    return expanded / C
24
+
25
+
26
def spectral_normalize_torch(magnitudes):
    """Log-compress spectrogram magnitudes using the default compression settings."""
    return dynamic_range_compression_torch(magnitudes)
29
+
30
+
31
def spectral_de_normalize_torch(magnitudes):
    """Undo the log compression using the default compression factor."""
    return dynamic_range_decompression_torch(magnitudes)
34
+
35
+
36
# Module-level caches filled lazily by the spectrogram helpers below.
mel_basis = dict()  # mel filterbank tensors
hann_window = dict()  # Hann window tensors
38
+
39
+
40
def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
    """Compute a linear magnitude spectrogram with ``torch.stft``.

    Args:
        y: waveform batch; a channel axis is inserted at dim 1 for padding and
            removed again, so a [batch, samples] layout is assumed.
        n_fft: FFT size.
        sampling_rate: unused; kept for signature compatibility.
        hop_size: hop length between frames.
        win_size: Hann window length.
        center: forwarded to ``torch.stft``.

    Returns:
        ``sqrt(re**2 + im**2 + 1e-6)`` of the one-sided STFT.
    """
    # Warn (without failing) when the waveform leaves the expected range.
    if torch.min(y) < -1.1:
        print("min value is ", torch.min(y))
    if torch.max(y) > 1.1:
        print("max value is ", torch.max(y))

    global hann_window
    dtype_device = str(y.dtype) + "_" + str(y.device)
    wnsize_dtype_device = str(win_size) + "_" + dtype_device
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
            dtype=y.dtype, device=y.device
        )

    # Reflect-pad by (n_fft - hop_size) / 2 on each side before framing.
    pad = int((n_fft - hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode="reflect")
    y = y.squeeze(1)

    # return_complex=False is deprecated in torch.stft; request the complex
    # result and view it as (real, imag) pairs, which is numerically the same.
    spec = torch.stft(
        y,
        n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=hann_window[wnsize_dtype_device],
        center=center,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
        return_complex=True,
    )
    spec = torch.view_as_real(spec)

    # Small epsilon keeps the square root differentiable at zero magnitude.
    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
    return spec
76
+
77
+
78
def spectrogram_torch_conv(y, n_fft, sampling_rate, hop_size, win_size, center=False):
    """Compute a linear magnitude spectrogram with a convolutional STFT.

    The windowed Fourier basis is applied with ``conv1d`` and the result is
    checked against ``torch.stft`` before the magnitude is returned.

    Args:
        y: waveform batch; a channel axis is inserted at dim 1, so a
            [batch, samples] layout is assumed.
        n_fft: FFT size.
        sampling_rate: unused; kept for signature compatibility.
        hop_size: hop length between frames.
        win_size: Hann window length; must not exceed ``n_fft``.
        center: must be False.

    Raises:
        ValueError: if ``win_size`` is larger than ``n_fft``.
        AssertionError: if ``center`` is not False, or the convolutional
            result disagrees with ``torch.stft``.
    """
    global hann_window
    dtype_device = str(y.dtype) + '_' + str(y.device)
    wnsize_dtype_device = str(win_size) + '_' + dtype_device
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)

    pad = int((n_fft - hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect')

    # ******************** ConvSTFT ************************#
    freq_cutoff = n_fft // 2 + 1
    fourier_basis = torch.view_as_real(torch.fft.fft(torch.eye(n_fft)))
    forward_basis = fourier_basis[:freq_cutoff].permute(2, 0, 1).reshape(-1, 1, fourier_basis.shape[1])

    # Centre the Hann window inside an n_fft-long frame. This replaces a call
    # to librosa.util.pad_center, which raised NameError because the module
    # only imports librosa.filters.mel.
    if win_size > n_fft:
        raise ValueError(f"win_size ({win_size}) must not exceed n_fft ({n_fft})")
    pad_left = (n_fft - win_size) // 2
    pad_right = n_fft - win_size - pad_left
    window = torch.nn.functional.pad(torch.hann_window(win_size), (pad_left, pad_right)).float()
    forward_basis = forward_basis * window

    assert center is False

    # Real parts occupy the first freq_cutoff output channels, imaginary parts the rest.
    forward_transform = torch.nn.functional.conv1d(y, forward_basis.to(y.device), stride=hop_size)
    spec2 = torch.stack(
        [forward_transform[:, :freq_cutoff, :], forward_transform[:, freq_cutoff:, :]],
        dim=-1,
    )

    # ******************** Verification ************************#
    # return_complex=False is deprecated; view the complex result as pairs instead.
    spec1 = torch.view_as_real(
        torch.stft(
            y.squeeze(1),
            n_fft,
            hop_length=hop_size,
            win_length=win_size,
            window=hann_window[wnsize_dtype_device],
            center=center,
            pad_mode='reflect',
            normalized=False,
            onesided=True,
            return_complex=True,
        )
    )
    assert torch.allclose(spec1, spec2, atol=1e-4)

    spec = torch.sqrt(spec2.pow(2).sum(-1) + 1e-6)
    return spec
120
+
121
+
122
def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
    """Project a linear spectrogram onto a mel filterbank and log-compress it.

    The filterbank is built once per parameter set, dtype and device, and is
    cached in the module-level ``mel_basis`` dict.
    """
    global mel_basis
    dtype_device = str(spec.dtype) + "_" + str(spec.device)
    # Key on every filterbank parameter: keying on fmax alone reused a stale
    # basis when sampling_rate, n_fft, num_mels or fmin changed between calls.
    basis_key = (
        "_".join(str(v) for v in (sampling_rate, n_fft, num_mels, fmin, fmax))
        + "_"
        + dtype_device
    )
    if basis_key not in mel_basis:
        # Keyword arguments work on both older librosa releases and those
        # where librosa.filters.mel is keyword-only.
        mel = librosa_mel_fn(
            sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
        )
        mel_basis[basis_key] = torch.from_numpy(mel).to(
            dtype=spec.dtype, device=spec.device
        )
    spec = torch.matmul(mel_basis[basis_key], spec)
    spec = spectral_normalize_torch(spec)
    return spec
134
+
135
+
136
def mel_spectrogram_torch(
    y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
):
    """Compute a log-compressed mel spectrogram from a waveform batch.

    Args:
        y: waveform batch; a channel axis is inserted at dim 1 for padding and
            removed again, so a [batch, samples] layout is assumed.
        n_fft: FFT size.
        num_mels: number of mel bands.
        sampling_rate: sample rate passed to the mel filterbank builder.
        hop_size: hop length between frames.
        win_size: Hann window length.
        fmin: lowest mel filter frequency.
        fmax: highest mel filter frequency.
        center: forwarded to ``torch.stft``.

    Returns:
        The mel-projected, log-compressed magnitude spectrogram.
    """
    # Warn (without failing) when the waveform leaves the expected range.
    if torch.min(y) < -1.0:
        print("min value is ", torch.min(y))
    if torch.max(y) > 1.0:
        print("max value is ", torch.max(y))

    global mel_basis, hann_window
    dtype_device = str(y.dtype) + "_" + str(y.device)
    # Key on every filterbank parameter: keying on fmax alone reused a stale
    # basis when sampling_rate, n_fft, num_mels or fmin changed between calls.
    basis_key = (
        "_".join(str(v) for v in (sampling_rate, n_fft, num_mels, fmin, fmax))
        + "_"
        + dtype_device
    )
    wnsize_dtype_device = str(win_size) + "_" + dtype_device
    if basis_key not in mel_basis:
        # Keyword arguments work on both older librosa releases and those
        # where librosa.filters.mel is keyword-only.
        mel = librosa_mel_fn(
            sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
        )
        mel_basis[basis_key] = torch.from_numpy(mel).to(
            dtype=y.dtype, device=y.device
        )
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
            dtype=y.dtype, device=y.device
        )

    # Reflect-pad by (n_fft - hop_size) / 2 on each side before framing.
    pad = int((n_fft - hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode="reflect")
    y = y.squeeze(1)

    # return_complex=False is deprecated in torch.stft; request the complex
    # result and view it as (real, imag) pairs, which is numerically the same.
    spec = torch.stft(
        y,
        n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=hann_window[wnsize_dtype_device],
        center=center,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
        return_complex=True,
    )
    spec = torch.view_as_real(spec)

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)

    spec = torch.matmul(mel_basis[basis_key], spec)
    spec = spectral_normalize_torch(spec)

    return spec
openvoice/models.py ADDED
@@ -0,0 +1,499 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import functional as F
5
+
6
+ from openvoice import commons
7
+ from openvoice import modules
8
+ from openvoice import attentions
9
+
10
+ from torch.nn import Conv1d, ConvTranspose1d, Conv2d
11
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
+
13
+ from openvoice.commons import init_weights, get_padding
14
+
15
+
16
class TextEncoder(nn.Module):
    """Embed token ids, encode them, and project to mean / log-scale statistics."""

    def __init__(
        self,
        n_vocab,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
    ):
        super().__init__()
        self.n_vocab = n_vocab
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout

        self.emb = nn.Embedding(n_vocab, hidden_channels)
        nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)

        self.encoder = attentions.Encoder(
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
        )
        # Twice out_channels: one half for the mean, one for the log-scale.
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths):
        """Return ``(encoded, m, logs, x_mask)`` for token ids ``x``."""
        embedded = self.emb(x) * math.sqrt(self.hidden_channels)  # [b, t, h]
        embedded = embedded.transpose(1, -1)  # [b, h, t]
        valid = commons.sequence_mask(x_lengths, embedded.size(2))
        x_mask = valid.unsqueeze(1).to(embedded.dtype)

        encoded = self.encoder(embedded * x_mask, x_mask)
        stats = self.proj(encoded) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return encoded, m, logs, x_mask
58
+
59
+
60
class DurationPredictor(nn.Module):
    """Two conv/ReLU/LayerNorm/dropout stages followed by a 1-channel projection.

    Inputs are detached, so no gradient flows back through ``x`` or ``g``.
    """

    def __init__(
        self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
    ):
        super().__init__()

        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.gin_channels = gin_channels

        half_kernel = kernel_size // 2
        self.drop = nn.Dropout(p_dropout)
        self.conv_1 = nn.Conv1d(
            in_channels, filter_channels, kernel_size, padding=half_kernel
        )
        self.norm_1 = modules.LayerNorm(filter_channels)
        self.conv_2 = nn.Conv1d(
            filter_channels, filter_channels, kernel_size, padding=half_kernel
        )
        self.norm_2 = modules.LayerNorm(filter_channels)
        self.proj = nn.Conv1d(filter_channels, 1, 1)

        # The conditioning projection exists only when conditioning is configured.
        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, in_channels, 1)

    def forward(self, x, x_mask, g=None):
        """Return the masked single-channel prediction for ``x``."""
        x = torch.detach(x)
        if g is not None:
            x = x + self.cond(torch.detach(g))

        stages = ((self.conv_1, self.norm_1), (self.conv_2, self.norm_2))
        for conv, norm in stages:
            x = conv(x * x_mask)
            x = torch.relu(x)
            x = norm(x)
            x = self.drop(x)

        return self.proj(x * x_mask) * x_mask
101
+
102
class StochasticDurationPredictor(nn.Module):
    """Flow-based duration model.

    With ``reverse=False`` the forward pass returns a per-batch loss term
    (``nll + logq``) for the given durations ``w``. With ``reverse=True`` it
    samples noise, runs the flows backwards and returns ``logw``.
    """

    def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0):
        super().__init__()
        filter_channels = in_channels  # it needs to be removed from future version.
        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        def build_flow_stack(count):
            # An element-wise affine layer followed by `count` ConvFlow/Flip pairs.
            stack = nn.ModuleList()
            stack.append(modules.ElementwiseAffine(2))
            for _ in range(count):
                stack.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
                stack.append(modules.Flip())
            return stack

        self.log_flow = modules.Log()
        self.flows = build_flow_stack(n_flows)

        self.post_pre = nn.Conv1d(1, filter_channels, 1)
        self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
        self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
        # The posterior stack always has four flow pairs, independent of n_flows.
        self.post_flows = build_flow_stack(4)

        self.pre = nn.Conv1d(in_channels, filter_channels, 1)
        self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
        self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, filter_channels, 1)

    def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
        """Return ``nll + logq`` ([b]) when not reversed, otherwise ``logw``."""
        x = torch.detach(x)
        x = self.pre(x)
        if g is not None:
            x = x + self.cond(torch.detach(g))
        x = self.convs(x, x_mask)
        x = self.proj(x) * x_mask

        if reverse:
            return self._sample_logw(x, x_mask, reverse, noise_scale)
        return self._negative_log_likelihood(x, x_mask, w, reverse)

    def _negative_log_likelihood(self, x, x_mask, w, reverse):
        """Training-time path: posterior flows on noise, then main flows on (z0, z1)."""
        assert w is not None
        log_two_pi = math.log(2 * math.pi)

        logdet_tot_q = 0
        h_w = self.post_pre(w)
        h_w = self.post_convs(h_w, x_mask)
        h_w = self.post_proj(h_w) * x_mask
        e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
        z_q = e_q
        for post_flow in self.post_flows:
            z_q, logdet_q = post_flow(z_q, x_mask, g=(x + h_w))
            logdet_tot_q += logdet_q
        z_u, z1 = torch.split(z_q, [1, 1], 1)
        u = torch.sigmoid(z_u) * x_mask
        z0 = (w - u) * x_mask
        logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2])
        logq = torch.sum(-0.5 * (log_two_pi + (e_q**2)) * x_mask, [1, 2]) - logdet_tot_q

        logdet_tot = 0
        z0, logdet = self.log_flow(z0, x_mask)
        logdet_tot += logdet
        z = torch.cat([z0, z1], 1)
        for flow in self.flows:
            z, logdet = flow(z, x_mask, g=x, reverse=reverse)
            logdet_tot = logdet_tot + logdet
        nll = torch.sum(0.5 * (log_two_pi + (z**2)) * x_mask, [1, 2]) - logdet_tot
        return nll + logq  # [b]

    def _sample_logw(self, x, x_mask, reverse, noise_scale):
        """Inference-time path: push scaled noise backwards through the flows."""
        backward = list(reversed(self.flows))
        backward = backward[:-2] + [backward[-1]]  # remove a useless vflow
        z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
        for flow in backward:
            z = flow(z, x_mask, g=x, reverse=reverse)
        logw, _ = torch.split(z, [1, 1], 1)
        return logw
181
+
182
class PosteriorEncoder(nn.Module):
    """Encode an input into mean / log-scale statistics and a sampled latent."""

    def __init__(
        self,
        in_channels,
        out_channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            gin_channels=gin_channels,
        )
        # Twice out_channels: one half for the mean, one for the log-scale.
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None, tau=1.0):
        """Return ``(z, m, logs, x_mask)``; ``tau`` scales the sampling noise."""
        valid = commons.sequence_mask(x_lengths, x.size(2))
        x_mask = valid.unsqueeze(1).to(x.dtype)

        hidden = self.pre(x) * x_mask
        hidden = self.enc(hidden, x_mask, g=g)
        stats = self.proj(hidden) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        noise = torch.randn_like(m) * tau * torch.exp(logs)
        z = (m + noise) * x_mask
        return z, m, logs, x_mask
222
+
223
+
224
class Generator(torch.nn.Module):
    """Upsampling waveform generator.

    Each stage applies a transposed convolution followed by the average of
    ``num_kernels`` residual blocks; a final convolution and ``tanh`` produce
    a single output channel.
    """

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels=0,
    ):
        super(Generator, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        # "1" selects ResBlock1; any other value selects ResBlock2.
        resblock_cls = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        # Channel count halves at every upsampling stage.
        self.ups = nn.ModuleList()
        for level, (rate, kernel) in enumerate(
            zip(upsample_rates, upsample_kernel_sizes)
        ):
            stage_in = upsample_initial_channel // (2**level)
            stage_out = upsample_initial_channel // (2 ** (level + 1))
            upsampler = ConvTranspose1d(
                stage_in, stage_out, kernel, rate, padding=(kernel - rate) // 2
            )
            self.ups.append(weight_norm(upsampler))

        self.resblocks = nn.ModuleList()
        for level in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (level + 1))
            for kernel, dilation in zip(
                resblock_kernel_sizes, resblock_dilation_sizes
            ):
                self.resblocks.append(resblock_cls(ch, kernel, dilation))

        # `ch` is the channel count of the last upsampling stage.
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x, g=None):
        """Generate the output signal from ``x``, optionally conditioned on ``g``."""
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for level in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[level](x)
            first_block = level * self.num_kernels
            xs = None
            for offset in range(self.num_kernels):
                branch = self.resblocks[first_block + offset](x)
                if xs is None:
                    xs = branch
                else:
                    xs += branch
            x = xs / self.num_kernels
        # This activation uses leaky_relu's default slope, not LRELU_SLOPE.
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        return torch.tanh(x)

    def remove_weight_norm(self):
        """Strip weight normalisation from the upsamplers and residual blocks."""
        print("Removing weight norm...")
        for upsampler in self.ups:
            remove_weight_norm(upsampler)
        for block in self.resblocks:
            block.remove_weight_norm()
299
+
300
+
301
class ReferenceEncoder(nn.Module):
    """Summarise a spectrogram into one fixed-size vector.

    inputs --- [N, Ty/r, n_mels*r]  mels
    outputs --- [N, gin_channels]
    """

    def __init__(self, spec_channels, gin_channels=0, layernorm=True):
        super().__init__()
        self.spec_channels = spec_channels

        # Six stride-2 convolutions; the input starts with a single channel.
        channel_plan = [1, 32, 32, 64, 64, 128, 128]
        n_convs = len(channel_plan) - 1
        self.convs = nn.ModuleList()
        for c_in, c_out in zip(channel_plan[:-1], channel_plan[1:]):
            conv = nn.Conv2d(
                in_channels=c_in,
                out_channels=c_out,
                kernel_size=(3, 3),
                stride=(2, 2),
                padding=(1, 1),
            )
            self.convs.append(weight_norm(conv))

        # Size of the frequency axis left after all convolutions.
        reduced_freq = self.calculate_channels(spec_channels, 3, 2, 1, n_convs)
        self.gru = nn.GRU(
            input_size=channel_plan[-1] * reduced_freq,
            hidden_size=256 // 2,
            batch_first=True,
        )
        self.proj = nn.Linear(128, gin_channels)
        self.layernorm = nn.LayerNorm(self.spec_channels) if layernorm else None

    def forward(self, inputs, mask=None):
        """Return the projected final GRU state; ``mask`` is accepted but unused."""
        batch = inputs.size(0)

        feats = inputs.view(batch, 1, -1, self.spec_channels)  # [N, 1, Ty, n_freqs]
        if self.layernorm is not None:
            feats = self.layernorm(feats)

        for conv in self.convs:
            feats = F.relu(conv(feats))  # [N, 128, Ty//2^K, n_mels//2^K]

        feats = feats.transpose(1, 2)  # [N, Ty//2^K, 128, n_mels//2^K]
        steps = feats.size(1)
        batch = feats.size(0)
        feats = feats.contiguous().view(batch, steps, -1)  # [N, Ty//2^K, 128*n_mels//2^K]

        self.gru.flatten_parameters()
        _, final_state = self.gru(feats)  # final_state --- [1, N, 128]

        return self.proj(final_state.squeeze(0))

    def calculate_channels(self, L, kernel_size, stride, pad, n_convs):
        """Return the length left after ``n_convs`` strided convolutions over ``L``."""
        remaining = L
        for _ in range(n_convs):
            remaining = (remaining - kernel_size + 2 * pad) // stride + 1
        return remaining
365
+
366
+
367
class ResidualCouplingBlock(nn.Module):
    """Stack of ``n_flows`` mean-only residual coupling layers, each followed by a Flip."""

    def __init__(self,
                 channels,
                 hidden_channels,
                 kernel_size,
                 dilation_rate,
                 n_layers,
                 n_flows=4,
                 gin_channels=0):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            coupling = modules.ResidualCouplingLayer(
                channels,
                hidden_channels,
                kernel_size,
                dilation_rate,
                n_layers,
                gin_channels=gin_channels,
                mean_only=True,
            )
            self.flows.append(coupling)
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        """Apply the flows in order, or in reversed order when ``reverse`` is set."""
        if reverse:
            for layer in reversed(self.flows):
                x = layer(x, x_mask, g=g, reverse=reverse)
            return x

        # In the forward direction every flow also returns a log-determinant, discarded here.
        for layer in self.flows:
            x, _ = layer(x, x_mask, g=g, reverse=reverse)
        return x
398
+
399
class SynthesizerTrn(nn.Module):
    """
    Synthesizer for Training

    The decoder (``dec``), posterior encoder (``enc_q``) and coupling flow
    (``flow``) are always built. With ``n_speakers == 0`` only a
    ``ReferenceEncoder`` is added; otherwise the text encoder, both duration
    predictors and a speaker-embedding table are created, which ``infer``
    relies on.
    """

    def __init__(
        self,
        n_vocab,
        spec_channels,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        n_speakers=256,
        gin_channels=256,
        zero_g=False,
        **kwargs
    ):
        super().__init__()

        self.dec = Generator(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels
        )

        self.n_speakers = n_speakers
        if n_speakers != 0:
            self.enc_p = TextEncoder(
                n_vocab,
                inter_channels,
                hidden_channels,
                filter_channels,
                n_heads,
                n_layers,
                kernel_size,
                p_dropout,
            )
            self.sdp = StochasticDurationPredictor(
                hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels
            )
            self.dp = DurationPredictor(
                hidden_channels, 256, 3, 0.5, gin_channels=gin_channels
            )
            self.emb_g = nn.Embedding(n_speakers, gin_channels)
        else:
            self.ref_enc = ReferenceEncoder(spec_channels, gin_channels)
        self.zero_g = zero_g

    def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., sdp_ratio=0.2, max_len=None):
        """Synthesize from text-side input ``x``.

        Returns ``(o, attn, y_mask, (z, z_p, m_p, logs_p))`` where ``o`` is
        the decoder output truncated to ``max_len`` frames.
        """
        x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
        # [b, h, 1] speaker conditioning, only when a speaker table exists.
        g = self.emb_g(sid).unsqueeze(-1) if self.n_speakers > 0 else None

        # Blend the stochastic and deterministic duration predictions.
        stochastic_logw = self.sdp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
        deterministic_logw = self.dp(x, x_mask, g=g)
        logw = stochastic_logw * sdp_ratio + deterministic_logw * (1 - sdp_ratio)

        w = torch.exp(logw) * x_mask * length_scale
        w_ceil = torch.ceil(w)
        y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
        y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype)
        attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
        attn = commons.generate_path(w_ceil, attn_mask)

        # Expand the prior statistics along the alignment:
        # [b, t', t], [b, t, d] -> [b, d, t']
        alignment = attn.squeeze(1)
        m_p = torch.matmul(alignment, m_p.transpose(1, 2)).transpose(1, 2)
        logs_p = torch.matmul(alignment, logs_p.transpose(1, 2)).transpose(1, 2)

        z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
        z = self.flow(z_p, y_mask, g=g, reverse=True)
        o = self.dec((z * y_mask)[:, :, :max_len], g=g)
        return o, attn, y_mask, (z, z_p, m_p, logs_p)

    def voice_conversion(self, y, y_lengths, sid_src, sid_tgt, tau=1.0):
        """Convert ``y`` from the source conditioning to the target conditioning.

        ``sid_src`` / ``sid_tgt`` are used directly as conditioning tensors.
        When ``self.zero_g`` is set, the posterior encoder and the decoder
        receive zero tensors instead, while the flow still receives the real
        conditioning.
        """
        g_src = sid_src
        g_tgt = sid_tgt

        if self.zero_g:
            enc_cond = torch.zeros_like(g_src)
        else:
            enc_cond = g_src
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=enc_cond, tau=tau)

        z_p = self.flow(z, y_mask, g=g_src)
        z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)

        if self.zero_g:
            dec_cond = torch.zeros_like(g_tgt)
        else:
            dec_cond = g_tgt
        o_hat = self.dec(z_hat * y_mask, g=dec_cond)
        return o_hat, y_mask, (z, z_p, z_hat)
openvoice/modules.py ADDED
@@ -0,0 +1,598 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import functional as F
5
+
6
+ from torch.nn import Conv1d
7
+ from torch.nn.utils import weight_norm, remove_weight_norm
8
+
9
+ from openvoice import commons
10
+ from openvoice.commons import init_weights, get_padding
11
+ from openvoice.transforms import piecewise_rational_quadratic_transform
12
+ from openvoice.attentions import Encoder
13
+
14
+ LRELU_SLOPE = 0.1
15
+
16
+
17
class LayerNorm(nn.Module):
    """Layer normalization over dimension 1 of the input, with learned scale and shift."""

    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps

        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        # F.layer_norm normalizes the trailing dimension, so swap dim 1 to the
        # end, normalize, and swap it back.
        channels_last = x.transpose(1, -1)
        normalized = F.layer_norm(
            channels_last, (self.channels,), self.gamma, self.beta, self.eps
        )
        return normalized.transpose(1, -1)
30
+
31
+
32
class ConvReluNorm(nn.Module):
    """Residual stack of Conv1d -> LayerNorm -> ReLU -> Dropout layers.

    The stack output goes through a 1x1 projection whose weights and bias
    are zero-initialized, then is added to the input and masked.

    Args:
        in_channels: channels of the input (and of the residual branch).
        hidden_channels: channels used inside the stack.
        out_channels: channels produced by the final 1x1 projection.
        kernel_size: kernel size of every stack convolution.
        n_layers: number of conv/norm layers; must be greater than 1.
        p_dropout: dropout probability applied after each ReLU.
    """

    def __init__(
        self,
        in_channels,
        hidden_channels,
        out_channels,
        kernel_size,
        n_layers,
        p_dropout,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout
        # The message previously said "larger than 0", which contradicted the check.
        assert n_layers > 1, "Number of layers should be larger than 1."

        self.conv_layers = nn.ModuleList()
        self.norm_layers = nn.ModuleList()
        self.conv_layers.append(
            nn.Conv1d(
                in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
            )
        )
        self.norm_layers.append(LayerNorm(hidden_channels))
        self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
        for _ in range(n_layers - 1):
            self.conv_layers.append(
                nn.Conv1d(
                    hidden_channels,
                    hidden_channels,
                    kernel_size,
                    padding=kernel_size // 2,
                )
            )
            self.norm_layers.append(LayerNorm(hidden_channels))
        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
        # Zero-init so the block starts out as a (masked) identity mapping.
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask):
        """Return ``(x + proj(stack(x))) * x_mask``."""
        x_org = x
        for conv, norm in zip(self.conv_layers, self.norm_layers):
            x = conv(x * x_mask)
            x = norm(x)
            x = self.relu_drop(x)
        x = x_org + self.proj(x)
        return x * x_mask
82
+
83
+
84
class DDSConv(nn.Module):
    """
    Dilated and Depth-Separable Convolution

    Each layer is a depthwise dilated convolution followed by a 1x1
    convolution, each with its own LayerNorm and GELU, added residually.
    """

    def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
        super().__init__()
        self.channels = channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout

        self.drop = nn.Dropout(p_dropout)
        self.convs_sep = nn.ModuleList()
        self.convs_1x1 = nn.ModuleList()
        self.norms_1 = nn.ModuleList()
        self.norms_2 = nn.ModuleList()
        for layer_idx in range(n_layers):
            # Dilation grows as kernel_size ** layer index; padding keeps the length.
            dilation = kernel_size**layer_idx
            padding = (kernel_size * dilation - dilation) // 2
            depthwise = nn.Conv1d(
                channels,
                channels,
                kernel_size,
                groups=channels,
                dilation=dilation,
                padding=padding,
            )
            self.convs_sep.append(depthwise)
            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
            self.norms_1.append(LayerNorm(channels))
            self.norms_2.append(LayerNorm(channels))

    def forward(self, x, x_mask, g=None):
        """Apply the residual layers to ``x`` (plus ``g`` if given) and mask the result."""
        if g is not None:
            x = x + g
        for layer_idx in range(self.n_layers):
            branch = self.convs_sep[layer_idx](x * x_mask)
            branch = F.gelu(self.norms_1[layer_idx](branch))
            branch = self.convs_1x1[layer_idx](branch)
            branch = F.gelu(self.norms_2[layer_idx](branch))
            x = x + self.drop(branch)
        return x * x_mask
131
+
132
+
133
class WN(torch.nn.Module):
    """Stack of weight-normalized dilated convolutions with gated activations.

    Each layer produces a residual part (fed to the next layer) and a skip
    part (accumulated into the output); the last layer produces only the
    skip part.
    """

    def __init__(
        self,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
        p_dropout=0,
    ):
        super().__init__()
        assert kernel_size % 2 == 1
        self.hidden_channels = hidden_channels
        self.kernel_size = (kernel_size,)
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.p_dropout = p_dropout

        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()
        self.drop = nn.Dropout(p_dropout)

        if gin_channels != 0:
            # One 1x1 convolution yields the conditioning for every layer at once.
            self.cond_layer = torch.nn.utils.weight_norm(
                torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1),
                name="weight",
            )

        last_layer = n_layers - 1
        for layer_idx in range(n_layers):
            dilation = dilation_rate**layer_idx
            padding = int((kernel_size * dilation - dilation) / 2)
            dilated_conv = torch.nn.Conv1d(
                hidden_channels,
                2 * hidden_channels,
                kernel_size,
                dilation=dilation,
                padding=padding,
            )
            self.in_layers.append(
                torch.nn.utils.weight_norm(dilated_conv, name="weight")
            )

            # last one is not necessary
            if layer_idx < last_layer:
                res_skip_channels = 2 * hidden_channels
            else:
                res_skip_channels = hidden_channels

            pointwise = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
            self.res_skip_layers.append(
                torch.nn.utils.weight_norm(pointwise, name="weight")
            )

    def forward(self, x, x_mask, g=None, **kwargs):
        """Return the masked sum of all skip outputs; extra kwargs are ignored."""
        skip_total = torch.zeros_like(x)
        n_channels_tensor = torch.IntTensor([self.hidden_channels])
        width = 2 * self.hidden_channels

        if g is not None:
            g = self.cond_layer(g)

        for layer_idx in range(self.n_layers):
            x_in = self.in_layers[layer_idx](x)
            if g is None:
                g_l = torch.zeros_like(x_in)
            else:
                # Slice out this layer's share of the conditioning channels.
                cond_offset = layer_idx * width
                g_l = g[:, cond_offset : cond_offset + width, :]

            acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
            acts = self.drop(acts)

            res_skip_acts = self.res_skip_layers[layer_idx](acts)
            if layer_idx < self.n_layers - 1:
                x = (x + res_skip_acts[:, : self.hidden_channels, :]) * x_mask
                skip_total = skip_total + res_skip_acts[:, self.hidden_channels :, :]
            else:
                skip_total = skip_total + res_skip_acts
        return skip_total * x_mask

    def remove_weight_norm(self):
        """Strip weight normalization from every convolution in the stack."""
        if self.gin_channels != 0:
            torch.nn.utils.remove_weight_norm(self.cond_layer)
        for layer in self.in_layers:
            torch.nn.utils.remove_weight_norm(layer)
        for layer in self.res_skip_layers:
            torch.nn.utils.remove_weight_norm(layer)
219
+
220
+
221
class ResBlock1(torch.nn.Module):
    """Residual block of three (dilated conv, plain conv) pairs with leaky-ReLU."""

    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super().__init__()

        def make_conv(rate):
            # Weight-normalized, length-preserving convolution with the given dilation.
            return weight_norm(
                Conv1d(
                    channels,
                    channels,
                    kernel_size,
                    1,
                    dilation=rate,
                    padding=get_padding(kernel_size, rate),
                )
            )

        self.convs1 = nn.ModuleList(
            [make_conv(rate) for rate in (dilation[0], dilation[1], dilation[2])]
        )
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList([make_conv(1) for _ in range(3)])
        self.convs2.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Apply the three residual pairs; mask intermediate and final values if given."""
        masked = x_mask is not None
        for dilated_conv, plain_conv in zip(self.convs1, self.convs2):
            branch = F.leaky_relu(x, LRELU_SLOPE)
            if masked:
                branch = branch * x_mask
            branch = F.leaky_relu(dilated_conv(branch), LRELU_SLOPE)
            if masked:
                branch = branch * x_mask
            x = plain_conv(branch) + x
        if masked:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Strip weight normalization from all six convolutions."""
        for conv in self.convs1:
            remove_weight_norm(conv)
        for conv in self.convs2:
            remove_weight_norm(conv)
316
+
317
+
318
class ResBlock2(torch.nn.Module):
    """Residual block of two dilated convolutions with leaky-ReLU pre-activation."""

    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
        super().__init__()
        layers = []
        for rate in (dilation[0], dilation[1]):
            layers.append(
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=rate,
                        padding=get_padding(kernel_size, rate),
                    )
                )
            )
        self.convs = nn.ModuleList(layers)
        self.convs.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Apply both residual convolutions; mask intermediate and final values if given."""
        masked = x_mask is not None
        for conv in self.convs:
            branch = F.leaky_relu(x, LRELU_SLOPE)
            if masked:
                branch = branch * x_mask
            x = conv(branch) + x
        if masked:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Strip weight normalization from both convolutions."""
        for conv in self.convs:
            remove_weight_norm(conv)
361
+
362
+
363
class Log(nn.Module):
    """Element-wise log flow; the reverse direction is the element-wise exp."""

    def forward(self, x, x_mask, reverse=False, **kwargs):
        if reverse:
            return torch.exp(x) * x_mask

        # Clamp before the log so non-positive inputs do not produce -inf/nan.
        log_x = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
        return log_x, torch.sum(-log_x, [1, 2])
372
+
373
+
374
class Flip(nn.Module):
    """Reverse the order of dimension 1; contributes a zero log-determinant."""

    def forward(self, x, *args, reverse=False, **kwargs):
        flipped = torch.flip(x, [1])
        if reverse:
            return flipped
        # One zero per batch item, on the input's dtype and device.
        zeros = torch.zeros(flipped.size(0)).to(
            dtype=flipped.dtype, device=flipped.device
        )
        return flipped, zeros
382
+
383
+
384
class ElementwiseAffine(nn.Module):
    """Per-channel affine flow ``y = m + exp(logs) * x`` with learned ``m`` and ``logs``."""

    def __init__(self, channels):
        super().__init__()
        self.channels = channels
        self.m = nn.Parameter(torch.zeros(channels, 1))
        self.logs = nn.Parameter(torch.zeros(channels, 1))

    def forward(self, x, x_mask, reverse=False, **kwargs):
        if reverse:
            return (x - self.m) * torch.exp(-self.logs) * x_mask

        transformed = (self.m + torch.exp(self.logs) * x) * x_mask
        # The log-determinant counts ``logs`` once per unmasked position.
        logdet = torch.sum(self.logs * x_mask, [1, 2])
        return transformed, logdet
400
+
401
+
402
class ResidualCouplingLayer(nn.Module):
    """Affine coupling layer whose statistics come from a WN stack.

    The first half of the channels passes through unchanged and is used to
    predict a shift (and, unless ``mean_only``, a log-scale) for the second
    half.
    """

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        p_dropout=0,
        gin_channels=0,
        mean_only=False,
    ):
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            p_dropout=p_dropout,
            gin_channels=gin_channels,
        )
        # One output group for the mean, plus one for the log-scale unless mean_only.
        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
        # Zero-init so the layer starts out as the identity transform.
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """Return ``(x, logdet)`` in the forward direction, or just ``x`` when reversed."""
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)

        hidden = self.enc(self.pre(x0) * x_mask, x_mask, g=g)
        stats = self.post(hidden) * x_mask
        if self.mean_only:
            m = stats
            logs = torch.zeros_like(m)
        else:
            m, logs = torch.split(stats, [self.half_channels] * 2, 1)

        if reverse:
            x1 = (x1 - m) * torch.exp(-logs) * x_mask
            return torch.cat([x0, x1], 1)

        x1 = m + x1 * torch.exp(logs) * x_mask
        logdet = torch.sum(logs, [1, 2])
        return torch.cat([x0, x1], 1), logdet
457
+
458
+
459
class ConvFlow(nn.Module):
    """Coupling flow that transforms half the channels with a rational-quadratic spline.

    The spline parameters are predicted from the untouched half by a DDSConv
    stack followed by a zero-initialized 1x1 projection.
    """

    def __init__(
        self,
        in_channels,
        filter_channels,
        kernel_size,
        n_layers,
        num_bins=10,
        tail_bound=5.0,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.num_bins = num_bins
        self.tail_bound = tail_bound
        self.half_channels = in_channels // 2

        self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
        self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
        # Per channel: num_bins widths, num_bins heights and num_bins - 1 derivatives.
        params_per_channel = num_bins * 3 - 1
        self.proj = nn.Conv1d(
            filter_channels, self.half_channels * params_per_channel, 1
        )
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """Return ``(x, logdet)`` in the forward direction, or just ``x`` when reversed."""
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        params = self.pre(x0)
        params = self.convs(params, x_mask, g=g)
        params = self.proj(params) * x_mask

        batch, chans, steps = x0.shape
        # [b, cx?, t] -> [b, c, t, ?]
        params = params.reshape(batch, chans, -1, steps).permute(0, 1, 3, 2)

        bins = self.num_bins
        unnormalized_widths = params[..., :bins] / math.sqrt(self.filter_channels)
        unnormalized_heights = params[..., bins : 2 * bins] / math.sqrt(
            self.filter_channels
        )
        unnormalized_derivatives = params[..., 2 * bins :]

        x1, logabsdet = piecewise_rational_quadratic_transform(
            x1,
            unnormalized_widths,
            unnormalized_heights,
            unnormalized_derivatives,
            inverse=reverse,
            tails="linear",
            tail_bound=self.tail_bound,
        )

        out = torch.cat([x0, x1], 1) * x_mask
        logdet = torch.sum(logabsdet * x_mask, [1, 2])
        if reverse:
            return out
        return out, logdet
517
+
518
+
519
class TransformerCouplingLayer(nn.Module):
    """Affine coupling layer whose statistics come from an attention ``Encoder``.

    The first half of the channels passes through unchanged and is used to
    predict a shift (and, unless ``mean_only``, a log-scale) for the second
    half.

    Args:
        channels: total channel count; must be even.
        hidden_channels: width of the encoder.
        kernel_size: kernel size passed to the encoder.
        n_layers: number of encoder layers; asserted to be exactly 3.
        n_heads: number of attention heads.
        p_dropout: dropout probability passed to the encoder.
        filter_channels: filter width passed to the encoder.
        mean_only: predict only the shift and use a zero log-scale.
        wn_sharing_parameter: when not ``None``, this object is used as the
            encoder instead of building a new ``Encoder``.
        gin_channels: conditioning channels passed to the encoder.
    """

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        n_layers,
        n_heads,
        p_dropout=0,
        filter_channels=0,
        mean_only=False,
        wn_sharing_parameter=None,
        gin_channels=0,
    ):
        assert n_layers == 3, n_layers
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = (
            Encoder(
                hidden_channels,
                filter_channels,
                n_heads,
                n_layers,
                kernel_size,
                p_dropout,
                isflow=True,
                gin_channels=gin_channels,
            )
            if wn_sharing_parameter is None
            else wn_sharing_parameter
        )
        # One output group for the mean, plus one for the log-scale unless mean_only.
        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
        # Zero-init so the layer starts out as the identity transform.
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """Return ``(x, logdet)`` in the forward direction, or just ``x`` when reversed."""
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        h = self.pre(x0) * x_mask
        h = self.enc(h, x_mask, g=g)
        stats = self.post(h) * x_mask
        if not self.mean_only:
            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
        else:
            m = stats
            logs = torch.zeros_like(m)

        if not reverse:
            x1 = m + x1 * torch.exp(logs) * x_mask
            x = torch.cat([x0, x1], 1)
            logdet = torch.sum(logs, [1, 2])
            return x, logdet

        x1 = (x1 - m) * torch.exp(-logs) * x_mask
        x = torch.cat([x0, x1], 1)
        return x
        # The original had an unreachable spline-transform section after this
        # point, which referenced names that do not exist in this class; it
        # has been removed.
openvoice/openvoice_app.py ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import argparse
4
+ import gradio as gr
5
+ from zipfile import ZipFile
6
+ import langid
7
+ from openvoice import se_extractor
8
+ from openvoice.api import BaseSpeakerTTS, ToneColorConverter
9
+
10
# Command-line options: --share asks Gradio for a public link.
parser = argparse.ArgumentParser()
parser.add_argument("--share", action='store_true', default=False, help="make link public")
args = parser.parse_args()

# NOTE(review): these are relative paths, resolved against the working
# directory; confirm the checkpoint layout on disk matches before launching.
en_ckpt_base = 'checkpoints/base_speakers/EN'
zh_ckpt_base = 'checkpoints/base_speakers/ZH'
ckpt_converter = 'checkpoints/converter'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
output_dir = 'outputs'
os.makedirs(output_dir, exist_ok=True)

# load models
en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_base}/config.json', device=device)
en_base_speaker_tts.load_ckpt(f'{en_ckpt_base}/checkpoint.pth')
zh_base_speaker_tts = BaseSpeakerTTS(f'{zh_ckpt_base}/config.json', device=device)
zh_base_speaker_tts.load_ckpt(f'{zh_ckpt_base}/checkpoint.pth')
tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

# load speaker embeddings
# map_location relocates the stored tensors while loading, so files saved
# from a CUDA device can still be opened on a CPU-only machine.
en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth', map_location=device).to(device)
en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth', map_location=device).to(device)
zh_source_se = torch.load(f'{zh_ckpt_base}/zh_default_se.pth', map_location=device).to(device)

# This online demo mainly supports English and Chinese
supported_languages = ['zh', 'en']
36
+
37
def _reject(text_hint, hint_line, warning):
    """Append ``hint_line`` to the info text, show ``warning`` in the UI, and build the failure result."""
    text_hint += hint_line
    gr.Warning(warning)
    return (
        text_hint,
        None,
        None,
    )


def predict(prompt, style, audio_file_pth, agree):
    """Synthesize ``prompt`` in ``style`` and convert it to the reference speaker's tone color.

    Args:
        prompt: text to speak; must be 2 to 200 characters long.
        style: speaking style name; Chinese accepts only ``'default'``.
        audio_file_pth: path of the reference speaker audio.
        agree: whether the user accepted the terms; any falsy value is rejected.

    Returns:
        A ``(info_text, output_audio_path, reference_audio_path)`` tuple.
        On any validation or extraction failure both paths are ``None`` and
        ``info_text`` carries an ``[ERROR]`` line.
    """
    # initialize an empty info
    text_hint = ''
    english_styles = ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']

    # agree with the terms
    if not agree:
        return _reject(
            text_hint,
            '[ERROR] Please accept the Terms & Condition!\n',
            "Please accept the Terms & Condition!",
        )

    # first detect the input language
    language_predicted = langid.classify(prompt)[0].strip()
    print(f"Detected language:{language_predicted}")

    if language_predicted not in supported_languages:
        return _reject(
            text_hint,
            f"[ERROR] The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}\n",
            f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}",
        )

    if language_predicted == "zh":
        tts_model = zh_base_speaker_tts
        source_se = zh_source_se
        language = 'Chinese'
        if style not in ['default']:
            return _reject(
                text_hint,
                f"[ERROR] The style {style} is not supported for Chinese, which should be in ['default']\n",
                f"The style {style} is not supported for Chinese, which should be in ['default']",
            )
    else:
        tts_model = en_base_speaker_tts
        # The default style has its own source embedding; every other style shares one.
        if style == 'default':
            source_se = en_source_default_se
        else:
            source_se = en_source_style_se
        language = 'English'
        if style not in english_styles:
            return _reject(
                text_hint,
                f"[ERROR] The style {style} is not supported for English, which should be in {english_styles}\n",
                f"The style {style} is not supported for English, which should be in {english_styles}",
            )

    speaker_wav = audio_file_pth

    if len(prompt) < 2:
        return _reject(
            text_hint,
            "[ERROR] Please give a longer prompt text \n",
            "Please give a longer prompt text",
        )
    if len(prompt) > 200:
        return _reject(
            text_hint,
            "[ERROR] Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo and try for your usage \n",
            "Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo for your usage",
        )

    # Extract the target speaker's tone-color embedding from the reference audio.
    # This is a top-level UI boundary, so any failure is reported to the user
    # instead of crashing the request.
    try:
        target_se, audio_name = se_extractor.get_se(speaker_wav, tone_color_converter, target_dir='processed', vad=True)
    except Exception as e:
        # Both strings are now f-strings; previously the warning showed the
        # literal text "{str(e)}" instead of the actual error.
        return _reject(
            text_hint,
            f"[ERROR] Get target tone color error {str(e)} \n",
            f"[ERROR] Get target tone color error {str(e)} \n",
        )

    # Base-speaker synthesis writes an intermediate file that the converter then reads.
    src_path = f'{output_dir}/tmp.wav'
    tts_model.tts(prompt, src_path, speaker=style, language=language)

    save_path = f'{output_dir}/output.wav'
    # Run the tone color converter
    encode_message = "@MyShell"
    tone_color_converter.convert(
        audio_src_path=src_path,
        src_se=source_se,
        tgt_se=target_se,
        output_path=save_path,
        message=encode_message)

    text_hint += '''Get response successfully \n'''

    return (
        text_hint,
        save_path,
        speaker_wav,
    )
150
+
151
+
152
+
153
+ title = "MyShell OpenVoice"
154
+
155
+ description = """
156
+ We introduce OpenVoice, a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set.
157
+ """
158
+
159
+ markdown_table = """
160
+ <div align="center" style="margin-bottom: 10px;">
161
+
162
+ | | | |
163
+ | :-----------: | :-----------: | :-----------: |
164
+ | **OpenSource Repo** | **Project Page** | **Join the Community** |
165
+ | <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | [OpenVoice](https://research.myshell.ai/open-voice) | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
166
+
167
+ </div>
168
+ """
169
+
170
+ markdown_table_v2 = """
171
+ <div align="center" style="margin-bottom: 2px;">
172
+
173
+ | | | | |
174
+ | :-----------: | :-----------: | :-----------: | :-----------: |
175
+ | **OpenSource Repo** | <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | **Project Page** | [OpenVoice](https://research.myshell.ai/open-voice) |
176
+
177
+ | | |
178
+ | :-----------: | :-----------: |
179
+ **Join the Community** | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
180
+
181
+ </div>
182
+ """
183
+ content = """
184
+ <div>
185
+ <strong>If the generated voice does not sound like the reference voice, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/docs/QA.md'>this QnA</a>.</strong> <strong>For multi-lingual & cross-lingual examples, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/demo_part2.ipynb'>this jupyter notebook</a>.</strong>
186
+ This online demo mainly supports <strong>English</strong>. The <em>default</em> style also supports <strong>Chinese</strong>. But OpenVoice can adapt to any other language as long as a base speaker is provided.
187
+ </div>
188
+ """
189
+ wrapped_markdown_content = f"<div style='border: 1px solid #000; padding: 10px;'>{content}</div>"
190
+
191
+
192
+ examples = [
193
+ [
194
+ "今天天气真好,我们一起出去吃饭吧。",
195
+ 'default',
196
+ "resources/demo_speaker1.mp3",
197
+ True,
198
+ ],[
199
+ "This audio is generated by open voice with a half-performance model.",
200
+ 'whispering',
201
+ "resources/demo_speaker2.mp3",
202
+ True,
203
+ ],
204
+ [
205
+ "He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
206
+ 'sad',
207
+ "resources/demo_speaker0.mp3",
208
+ True,
209
+ ],
210
+ ]
211
+
212
+ with gr.Blocks(analytics_enabled=False) as demo:
213
+
214
+ with gr.Row():
215
+ with gr.Column():
216
+ with gr.Row():
217
+ gr.Markdown(
218
+ """
219
+ ## <img src="https://huggingface.co/spaces/myshell-ai/OpenVoice/raw/main/logo.jpg" height="40"/>
220
+ """
221
+ )
222
+ with gr.Row():
223
+ gr.Markdown(markdown_table_v2)
224
+ with gr.Row():
225
+ gr.Markdown(description)
226
+ with gr.Column():
227
+ gr.Video('https://github.com/myshell-ai/OpenVoice/assets/40556743/3cba936f-82bf-476c-9e52-09f0f417bb2f', autoplay=True)
228
+
229
+ with gr.Row():
230
+ gr.HTML(wrapped_markdown_content)
231
+
232
+ with gr.Row():
233
+ with gr.Column():
234
+ input_text_gr = gr.Textbox(
235
+ label="Text Prompt",
236
+ info="One or two sentences at a time is better. Up to 200 text characters.",
237
+ value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
238
+ )
239
+ style_gr = gr.Dropdown(
240
+ label="Style",
241
+ info="Select a style of output audio for the synthesised speech. (Chinese only support 'default' now)",
242
+ choices=['default', 'whispering', 'cheerful', 'terrified', 'angry', 'sad', 'friendly'],
243
+ max_choices=1,
244
+ value="default",
245
+ )
246
+ ref_gr = gr.Audio(
247
+ label="Reference Audio",
248
+ info="Click on the ✎ button to upload your own target speaker audio",
249
+ type="filepath",
250
+ value="resources/demo_speaker2.mp3",
251
+ )
252
+ tos_gr = gr.Checkbox(
253
+ label="Agree",
254
+ value=False,
255
+ info="I agree to the terms of the cc-by-nc-4.0 license-: https://github.com/myshell-ai/OpenVoice/blob/main/LICENSE",
256
+ )
257
+
258
+ tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
259
+
260
+
261
+ with gr.Column():
262
+ out_text_gr = gr.Text(label="Info")
263
+ audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
264
+ ref_audio_gr = gr.Audio(label="Reference Audio Used")
265
+
266
+ gr.Examples(examples,
267
+ label="Examples",
268
+ inputs=[input_text_gr, style_gr, ref_gr, tos_gr],
269
+ outputs=[out_text_gr, audio_gr, ref_audio_gr],
270
+ fn=predict,
271
+ cache_examples=False,)
272
+ tts_button.click(predict, [input_text_gr, style_gr, ref_gr, tos_gr], outputs=[out_text_gr, audio_gr, ref_audio_gr])
273
+
274
+ demo.queue()
275
+ demo.launch(debug=True, show_api=True, share=args.share)
openvoice/se_extractor.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import glob
3
+ import torch
4
+ import hashlib
5
+ import librosa
6
+ import base64
7
+ from glob import glob
8
+ import numpy as np
9
+ from pydub import AudioSegment
10
+ from faster_whisper import WhisperModel
11
+ import hashlib
12
+ import base64
13
+ import librosa
14
+ from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
15
+
16
# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Whisper checkpoint size passed to WhisperModel in split_audio_whisper.
model_size = "medium"
# The Whisper model is built lazily on the first split_audio_whisper call
# (float16 on CUDA, float32 on CPU), so nothing is loaded at import time.
model = None
22
def split_audio_whisper(audio_path, audio_name, target_dir='processed'):
    """Cut an audio file into per-segment WAV clips using Whisper timestamps.

    The module-level Whisper model is created on first use and reused
    afterwards. Every transcribed segment becomes a candidate clip; a clip is
    written only when it is longer than 1.5 s, shorter than 20 s, and its
    cleaned transcript has between 2 and 199 characters.

    Args:
        audio_path: path of the audio file to transcribe and slice.
        audio_name: name used for the output sub-folder and clip file names.
        target_dir: root directory under which ``<audio_name>/wavs`` is created.

    Returns:
        Path of the ``wavs`` folder holding the exported clips.
    """
    global model
    if model is None:
        model = WhisperModel(model_size, device=device, compute_type="float16" if device == "cuda" else "float32")
    audio = AudioSegment.from_file(audio_path)
    max_len = len(audio)

    target_folder = os.path.join(target_dir, audio_name)

    # The second value returned by transcribe() is not needed here.
    segments, _ = model.transcribe(audio_path, beam_size=5, word_timestamps=True)
    segments = list(segments)

    # create directory
    os.makedirs(target_folder, exist_ok=True)
    wavs_folder = os.path.join(target_folder, 'wavs')
    os.makedirs(wavs_folder, exist_ok=True)

    start_time = None
    for k, w in enumerate(segments):
        # The first clip starts at the first segment; later starts are set at
        # the end of the previous iteration.
        if k == 0:
            start_time = max(0, w.start)

        end_time = w.end

        # clean text
        text = w.text.replace('...', '')

        # Segment times are seconds while AudioSegment slices are indexed in
        # milliseconds; keep 80 ms of trailing audio, clamped to the file end.
        audio_seg = audio[int(start_time * 1000):min(max_len, int(end_time * 1000) + 80)]

        # The file index is the segment index, so skipped segments leave gaps
        # in the numbering.
        fname = f"{audio_name}_seg{k}.wav"

        # keep only clips between 1.5 s and 20 s with a plausible transcript
        save = (
            1.5 < audio_seg.duration_seconds < 20.
            and 2 <= len(text) < 200
        )

        if save:
            output_file = os.path.join(wavs_folder, fname)
            audio_seg.export(output_file, format='wav')

        if k < len(segments) - 1:
            # Start the next clip 0.08 s before the next segment begins.
            start_time = max(0, segments[k + 1].start - 0.08)

    return wavs_folder
78
+
79
+
80
def split_audio_vad(audio_path, audio_name, target_dir, split_seconds=10.0):
    """Remove non-speech with VAD and split the remainder into equal clips.

    Speech regions found by ``get_vad_segments`` are concatenated, and the
    result is divided into ``round(duration / split_seconds)`` clips of equal
    length, which are exported as WAV files.

    Args:
        audio_path: path of the audio file to process.
        audio_name: name used for the output sub-folder and clip file names.
        target_dir: root directory under which ``<audio_name>/wavs`` is created.
        split_seconds: target clip length in seconds.

    Returns:
        Path of the ``wavs`` folder holding the exported clips.

    Raises:
        AssertionError: if the detected speech is too short to form one clip.
    """
    SAMPLE_RATE = 16000
    audio_vad = get_audio_tensor(audio_path)
    segments = get_vad_segments(
        audio_vad,
        output_sample=True,
        min_speech_duration=0.1,
        min_silence_duration=1,
        method="silero",
    )
    # Segment boundaries are divided by SAMPLE_RATE to obtain seconds.
    segments = [(seg["start"], seg["end"]) for seg in segments]
    segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s, e in segments]
    print(segments)
    audio_active = AudioSegment.silent(duration=0)
    audio = AudioSegment.from_file(audio_path)

    # Concatenate only the speech regions (AudioSegment slices use milliseconds).
    for start_time, end_time in segments:
        audio_active += audio[int(start_time * 1000):int(end_time * 1000)]

    audio_dur = audio_active.duration_seconds
    print(f'after vad: dur = {audio_dur}')
    target_folder = os.path.join(target_dir, audio_name)
    wavs_folder = os.path.join(target_folder, 'wavs')
    os.makedirs(wavs_folder, exist_ok=True)

    num_splits = int(np.round(audio_dur / split_seconds))
    # Raised explicitly instead of via `assert`, which is stripped under
    # `python -O` and would let the division below fail with ZeroDivisionError.
    if num_splits <= 0:
        raise AssertionError('input audio is too short')
    interval = audio_dur / num_splits

    start_time = 0.
    for i in range(num_splits):
        end_time = min(start_time + interval, audio_dur)
        if i == num_splits - 1:
            # Let the last clip absorb any floating-point remainder.
            end_time = audio_dur
        output_file = f"{wavs_folder}/{audio_name}_seg{i}.wav"
        audio_seg = audio_active[int(start_time * 1000):int(end_time * 1000)]
        audio_seg.export(output_file, format='wav')
        start_time = end_time
    return wavs_folder
120
+
121
def hash_numpy_array(audio_path):
    """Return a 16-character fingerprint of an audio file's decoded samples.

    The file is loaded as mono at its native sample rate, the raw sample bytes
    are hashed with SHA-256, and the digest is base64-encoded. The first 16
    characters are kept, with every '/' replaced by '_^'.
    """
    samples, _ = librosa.load(audio_path, sr=None, mono=True)
    digest = hashlib.sha256(samples.tobytes()).digest()
    encoded = base64.b64encode(digest).decode('utf-8')
    return encoded[:16].replace('/', '_^')
131
+
132
def get_se(audio_path, vc_model, target_dir='processed', vad=True):
    """Split a reference recording into clips and extract its speaker embedding.

    Args:
        audio_path: path of the reference audio file.
        vc_model: object exposing ``device``, ``version`` and ``extract_se``.
        target_dir: root directory for the generated clips and ``se.pth``.
        vad: split with ``split_audio_vad`` when true, otherwise with
            ``split_audio_whisper``.

    Returns:
        ``(vc_model.extract_se(...), audio_name)``, where ``audio_name`` combines
        the file stem, the model version and a hash of the audio samples.

    Raises:
        NotImplementedError: if splitting produced no ``.wav`` clips.
    """
    device = vc_model.device  # read as in the original; not used below
    version = vc_model.version
    print("OpenVoice version:", version)

    stem = os.path.basename(audio_path).rsplit('.', 1)[0]
    audio_name = f"{stem}_{version}_{hash_numpy_array(audio_path)}"
    se_path = os.path.join(target_dir, audio_name, 'se.pth')

    splitter = split_audio_vad if vad else split_audio_whisper
    wavs_folder = splitter(audio_path, target_dir=target_dir, audio_name=audio_name)

    audio_segs = glob(f'{wavs_folder}/*.wav')
    if not audio_segs:
        raise NotImplementedError('No audio segments found!')

    embedding = vc_model.extract_se(audio_segs, se_save_path=se_path)
    return embedding, audio_name
156
+
openvoice/text/__init__.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ from https://github.com/keithito/tacotron """
2
+ from openvoice.text import cleaners
3
+ from openvoice.text.symbols import symbols
4
+
5
+
6
+ # Mappings from symbol to numeric ID and vice versa:
7
+ _symbol_to_id = {s: i for i, s in enumerate(symbols)}
8
+ _id_to_symbol = {i: s for i, s in enumerate(symbols)}
9
+
10
+
11
def text_to_sequence(text, symbols, cleaner_names):
    '''Cleans a string and converts it to a sequence of symbol IDs.
      Args:
        text: string to convert to a sequence
        symbols: ordered symbols; a symbol's ID is its position in this sequence
        cleaner_names: names of the cleaner functions to run the text through
      Returns:
        List of integers corresponding to the symbols in the cleaned text;
        characters that are not in `symbols` are skipped
    '''
    lookup = {sym: idx for idx, sym in enumerate(symbols)}
    cleaned = _clean_text(text, cleaner_names)
    print(cleaned)
    print(f" length:{len(cleaned)}")
    ids = [lookup[ch] for ch in cleaned if ch in lookup]
    print(f" length:{len(ids)}")
    return ids
31
+
32
+
33
def cleaned_text_to_sequence(cleaned_text, symbols):
    '''Converts already-cleaned text to a sequence of symbol IDs.
      Args:
        cleaned_text: string to convert to a sequence
        symbols: ordered symbols; a symbol's ID is its position in this sequence
      Returns:
        List of integers corresponding to the symbols in the text;
        characters that are not in `symbols` are skipped
    '''
    lookup = {sym: idx for idx, sym in enumerate(symbols)}
    ids = []
    for ch in cleaned_text:
        if ch in lookup:
            ids.append(lookup[ch])
    return ids
43
+
44
+
45
+
46
+ from openvoice.text.symbols import language_tone_start_map
47
def cleaned_text_to_sequence_vits2(cleaned_text, tones, language, symbols, languages):
    """Converts cleaned text, tones and a language tag to ID sequences.
      Args:
        cleaned_text: iterable of symbols to convert; each must be in `symbols`
        tones: per-symbol tone numbers, relative to the language's tone range
        language: key into `language_tone_start_map` and entry of `languages`
        symbols: ordered symbols; a symbol's ID is its position in this sequence
        languages: ordered language names; a language's ID is its position
      Returns:
        (phones, tones, lang_ids): symbol IDs, tones shifted by the language's
        tone offset, and the language ID repeated once per phone
    """
    symbol_ids = {sym: idx for idx, sym in enumerate(symbols)}
    language_ids = {lang: idx for idx, lang in enumerate(languages)}
    # Unlike cleaned_text_to_sequence, unknown symbols raise KeyError here.
    phones = [symbol_ids[sym] for sym in cleaned_text]
    offset = language_tone_start_map[language]
    shifted_tones = [tone + offset for tone in tones]
    lang_ids = [language_ids[language]] * len(phones)
    return phones, shifted_tones, lang_ids
62
+
63
+
64
def sequence_to_text(sequence):
    '''Converts a sequence of IDs back to a string using the module-level symbol table'''
    return ''.join(_id_to_symbol[symbol_id] for symbol_id in sequence)
71
+
72
+
73
def _clean_text(text, cleaner_names):
    '''Runs text through each named cleaner from the `cleaners` module, in order.
      Args:
        text: string to clean
        cleaner_names: names of functions defined in the `cleaners` module
      Returns:
        The text after all cleaners have been applied
      Raises:
        Exception: if a name does not refer to a cleaner in `cleaners`
    '''
    for name in cleaner_names:
        # A default is required: without it getattr raises AttributeError for a
        # missing name and the "Unknown cleaner" check below can never fire.
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text
openvoice/text/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (3.68 kB). View file
 
openvoice/text/__pycache__/cleaners.cpython-310.pyc ADDED
Binary file (1.28 kB). View file
 
openvoice/text/__pycache__/english.cpython-310.pyc ADDED
Binary file (4.7 kB). View file
 
openvoice/text/__pycache__/mandarin.cpython-310.pyc ADDED
Binary file (5.99 kB). View file
 
openvoice/text/__pycache__/symbols.cpython-310.pyc ADDED
Binary file (656 Bytes). View file
 
openvoice/text/cleaners.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from openvoice.text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
3
+ from openvoice.text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2
4
+
5
def cjke_cleaners2(text):
    """Convert language-tagged text to IPA and normalise its ending.

    Spans wrapped as ``[ZH]...[ZH]``, ``[JA]...[JA]``, ``[KO]...[KO]`` and
    ``[EN]...[EN]`` are replaced by the output of the matching converter
    followed by a space. Trailing whitespace is then stripped, and a '.' is
    appended unless the text already ends with one of ``. , ! ? - … ~``.

    Raises:
        NameError: if a tagged span needs a converter that is not available in
            this module. Only the Chinese and English converters are imported
            here, so ``[JA]`` and ``[KO]`` spans currently raise.
    """
    def _tagged(converter_name, tag):
        # Resolve the converter when a span is actually matched, so a missing
        # one yields a message naming the tag rather than a bare NameError.
        def _replace(match):
            converter = globals().get(converter_name)
            if converter is None:
                raise NameError(
                    "name '%s' is not defined: [%s] text is not supported by "
                    "this module" % (converter_name, tag))
            return converter(match.group(1)) + ' '
        return _replace

    text = re.sub(r'\[ZH\](.*?)\[ZH\]', _tagged('chinese_to_ipa', 'ZH'), text)
    text = re.sub(r'\[JA\](.*?)\[JA\]', _tagged('japanese_to_ipa2', 'JA'), text)
    text = re.sub(r'\[KO\](.*?)\[KO\]', _tagged('korean_to_ipa', 'KO'), text)
    text = re.sub(r'\[EN\](.*?)\[EN\]', _tagged('english_to_ipa2', 'EN'), text)
    text = re.sub(r'\s+$', '', text)
    text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
    return text
openvoice/text/english.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ from https://github.com/keithito/tacotron """
2
+
3
+ '''
4
+ Cleaners are transformations that run over the input text at both training and eval time.
5
+
6
+ Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
7
+ hyperparameter. Some cleaners are English-specific. You'll typically want to use:
8
+ 1. "english_cleaners" for English text
9
+ 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
10
+ the Unidecode library (https://pypi.python.org/pypi/Unidecode)
11
+ 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
12
+ the symbols in symbols.py to match your data).
13
+ '''
14
+
15
+
16
+ # Regular expression matching whitespace:
17
+
18
+
19
+ import re
20
+ import inflect
21
+ from unidecode import unidecode
22
+ import eng_to_ipa as ipa
23
+ _inflect = inflect.engine()
24
+ _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
25
+ _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
26
+ _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
27
+ _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
28
+ _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
29
+ _number_re = re.compile(r'[0-9]+')
30
+
31
+ # List of (regular expression, replacement) pairs for abbreviations:
32
+ _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
33
+ ('mrs', 'misess'),
34
+ ('mr', 'mister'),
35
+ ('dr', 'doctor'),
36
+ ('st', 'saint'),
37
+ ('co', 'company'),
38
+ ('jr', 'junior'),
39
+ ('maj', 'major'),
40
+ ('gen', 'general'),
41
+ ('drs', 'doctors'),
42
+ ('rev', 'reverend'),
43
+ ('lt', 'lieutenant'),
44
+ ('hon', 'honorable'),
45
+ ('sgt', 'sergeant'),
46
+ ('capt', 'captain'),
47
+ ('esq', 'esquire'),
48
+ ('ltd', 'limited'),
49
+ ('col', 'colonel'),
50
+ ('ft', 'fort'),
51
+ ]]
52
+
53
+
54
+ # List of (ipa, lazy ipa) pairs:
55
+ _lazy_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
56
+ ('r', 'ɹ'),
57
+ ('æ', 'e'),
58
+ ('ɑ', 'a'),
59
+ ('ɔ', 'o'),
60
+ ('ð', 'z'),
61
+ ('θ', 's'),
62
+ ('ɛ', 'e'),
63
+ ('ɪ', 'i'),
64
+ ('ʊ', 'u'),
65
+ ('ʒ', 'ʥ'),
66
+ ('ʤ', 'ʥ'),
67
+ ('ˈ', '↓'),
68
+ ]]
69
+
70
+ # List of (ipa, lazy ipa2) pairs:
71
+ _lazy_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
72
+ ('r', 'ɹ'),
73
+ ('ð', 'z'),
74
+ ('θ', 's'),
75
+ ('ʒ', 'ʑ'),
76
+ ('ʤ', 'dʑ'),
77
+ ('ˈ', '↓'),
78
+ ]]
79
+
80
+ # List of (ipa, ipa2) pairs
81
+ _ipa_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
82
+ ('r', 'ɹ'),
83
+ ('ʤ', 'dʒ'),
84
+ ('ʧ', 'tʃ')
85
+ ]]
86
+
87
+
88
def expand_abbreviations(text):
    """Replace every abbreviation listed in `_abbreviations` with its expansion."""
    for pattern, expansion in _abbreviations:
        text = pattern.sub(expansion, text)
    return text
92
+
93
+
94
def collapse_whitespace(text):
    """Replace each run of whitespace characters with a single space."""
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)
96
+
97
+
98
+ def _remove_commas(m):
99
+ return m.group(1).replace(',', '')
100
+
101
+
102
+ def _expand_decimal_point(m):
103
+ return m.group(1).replace('.', ' point ')
104
+
105
+
106
+ def _expand_dollars(m):
107
+ match = m.group(1)
108
+ parts = match.split('.')
109
+ if len(parts) > 2:
110
+ return match + ' dollars' # Unexpected format
111
+ dollars = int(parts[0]) if parts[0] else 0
112
+ cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
113
+ if dollars and cents:
114
+ dollar_unit = 'dollar' if dollars == 1 else 'dollars'
115
+ cent_unit = 'cent' if cents == 1 else 'cents'
116
+ return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
117
+ elif dollars:
118
+ dollar_unit = 'dollar' if dollars == 1 else 'dollars'
119
+ return '%s %s' % (dollars, dollar_unit)
120
+ elif cents:
121
+ cent_unit = 'cent' if cents == 1 else 'cents'
122
+ return '%s %s' % (cents, cent_unit)
123
+ else:
124
+ return 'zero dollars'
125
+
126
+
127
def _expand_ordinal(m):
    """Spell out the ordinal matched by `m` using the shared inflect engine."""
    ordinal = m.group(0)
    return _inflect.number_to_words(ordinal)
129
+
130
+
131
def _expand_number(m):
    """Spell out the integer matched by `m`.

    Values strictly between 1000 and 3000 get special handling: 2000 and
    2001-2009 read 'two thousand ...', exact hundreds read '<n> hundred', and
    everything else is read in two-digit groups with 'oh' for zero. All other
    values are spelled normally, without 'and'.
    """
    num = int(m.group(0))
    if not (1000 < num < 3000):
        return _inflect.number_to_words(num, andword='')
    if num == 2000:
        return 'two thousand'
    if 2000 < num < 2010:
        return 'two thousand ' + _inflect.number_to_words(num % 100)
    if num % 100 == 0:
        return _inflect.number_to_words(num // 100) + ' hundred'
    grouped = _inflect.number_to_words(num, andword='', zero='oh', group=2)
    return grouped.replace(', ', ' ')
144
+
145
+
146
def normalize_numbers(text):
    """Rewrite the numeric expressions in `text` as words.

    The rewrites run in a fixed order: thousands separators are removed, then
    pounds, dollars, decimals and ordinals are expanded, and finally any
    remaining digit runs.
    """
    rewrites = (
        (_comma_number_re, _remove_commas),
        (_pounds_re, r'\1 pounds'),
        (_dollars_re, _expand_dollars),
        (_decimal_number_re, _expand_decimal_point),
        (_ordinal_re, _expand_ordinal),
        (_number_re, _expand_number),
    )
    for pattern, replacement in rewrites:
        text = pattern.sub(replacement, text)
    return text
154
+
155
+
156
def mark_dark_l(text):
    """Replace 'l' with 'ɫ' where only non-vowel characters follow it in the word.

    An 'l' is rewritten when it is followed by zero or more characters that
    are neither a listed vowel nor a space, and then by a space or the end of
    the text.
    """
    dark_l = re.compile(r'l([^aeiouæɑɔəɛɪʊ ]*(?: |$))')
    return dark_l.sub(r'ɫ\1', text)
158
+
159
+
160
def english_to_ipa(text):
    """Convert English text to IPA.

    The text is transliterated with unidecode and lower-cased, abbreviations
    and numbers are expanded, the result is passed to `ipa.convert`, and runs
    of whitespace in the output are collapsed.
    """
    normalized = normalize_numbers(expand_abbreviations(unidecode(text).lower()))
    return collapse_whitespace(ipa.convert(normalized))
167
+
168
+
169
def english_to_lazy_ipa(text):
    """Convert English text to IPA, then apply the `_lazy_ipa` substitutions."""
    phonemes = english_to_ipa(text)
    for pattern, replacement in _lazy_ipa:
        phonemes = pattern.sub(replacement, phonemes)
    return phonemes
174
+
175
+
176
def english_to_ipa2(text):
    """Convert English text to the IPA variant used by `cjke_cleaners2`.

    Applies `english_to_ipa`, marks dark l, runs the `_ipa_to_ipa2`
    substitutions, and replaces '...' with a single ellipsis character.
    """
    phonemes = mark_dark_l(english_to_ipa(text))
    for pattern, replacement in _ipa_to_ipa2:
        phonemes = pattern.sub(replacement, phonemes)
    return phonemes.replace('...', '…')
182
+
183
+
184
def english_to_lazy_ipa2(text):
    """Convert English text to IPA, then apply the `_lazy_ipa2` substitutions."""
    phonemes = english_to_ipa(text)
    for pattern, replacement in _lazy_ipa2:
        phonemes = pattern.sub(replacement, phonemes)
    return phonemes
openvoice/text/mandarin.py ADDED
@@ -0,0 +1,326 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import re
4
+ from pypinyin import lazy_pinyin, BOPOMOFO
5
+ import jieba
6
+ import cn2an
7
+ import logging
8
+
9
+
10
+ # List of (Latin alphabet, bopomofo) pairs:
11
+ _latin_to_bopomofo = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
12
+ ('a', 'ㄟˉ'),
13
+ ('b', 'ㄅㄧˋ'),
14
+ ('c', 'ㄙㄧˉ'),
15
+ ('d', 'ㄉㄧˋ'),
16
+ ('e', 'ㄧˋ'),
17
+ ('f', 'ㄝˊㄈㄨˋ'),
18
+ ('g', 'ㄐㄧˋ'),
19
+ ('h', 'ㄝˇㄑㄩˋ'),
20
+ ('i', 'ㄞˋ'),
21
+ ('j', 'ㄐㄟˋ'),
22
+ ('k', 'ㄎㄟˋ'),
23
+ ('l', 'ㄝˊㄛˋ'),
24
+ ('m', 'ㄝˊㄇㄨˋ'),
25
+ ('n', 'ㄣˉ'),
26
+ ('o', 'ㄡˉ'),
27
+ ('p', 'ㄆㄧˉ'),
28
+ ('q', 'ㄎㄧㄡˉ'),
29
+ ('r', 'ㄚˋ'),
30
+ ('s', 'ㄝˊㄙˋ'),
31
+ ('t', 'ㄊㄧˋ'),
32
+ ('u', 'ㄧㄡˉ'),
33
+ ('v', 'ㄨㄧˉ'),
34
+ ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'),
35
+ ('x', 'ㄝˉㄎㄨˋㄙˋ'),
36
+ ('y', 'ㄨㄞˋ'),
37
+ ('z', 'ㄗㄟˋ')
38
+ ]]
39
+
40
+ # List of (bopomofo, romaji) pairs:
41
+ _bopomofo_to_romaji = [(re.compile('%s' % x[0]), x[1]) for x in [
42
+ ('ㄅㄛ', 'p⁼wo'),
43
+ ('ㄆㄛ', 'pʰwo'),
44
+ ('ㄇㄛ', 'mwo'),
45
+ ('ㄈㄛ', 'fwo'),
46
+ ('ㄅ', 'p⁼'),
47
+ ('ㄆ', 'pʰ'),
48
+ ('ㄇ', 'm'),
49
+ ('ㄈ', 'f'),
50
+ ('ㄉ', 't⁼'),
51
+ ('ㄊ', 'tʰ'),
52
+ ('ㄋ', 'n'),
53
+ ('ㄌ', 'l'),
54
+ ('ㄍ', 'k⁼'),
55
+ ('ㄎ', 'kʰ'),
56
+ ('ㄏ', 'h'),
57
+ ('ㄐ', 'ʧ⁼'),
58
+ ('ㄑ', 'ʧʰ'),
59
+ ('ㄒ', 'ʃ'),
60
+ ('ㄓ', 'ʦ`⁼'),
61
+ ('ㄔ', 'ʦ`ʰ'),
62
+ ('ㄕ', 's`'),
63
+ ('ㄖ', 'ɹ`'),
64
+ ('ㄗ', 'ʦ⁼'),
65
+ ('ㄘ', 'ʦʰ'),
66
+ ('ㄙ', 's'),
67
+ ('ㄚ', 'a'),
68
+ ('ㄛ', 'o'),
69
+ ('ㄜ', 'ə'),
70
+ ('ㄝ', 'e'),
71
+ ('ㄞ', 'ai'),
72
+ ('ㄟ', 'ei'),
73
+ ('ㄠ', 'au'),
74
+ ('ㄡ', 'ou'),
75
+ ('ㄧㄢ', 'yeNN'),
76
+ ('ㄢ', 'aNN'),
77
+ ('ㄧㄣ', 'iNN'),
78
+ ('ㄣ', 'əNN'),
79
+ ('ㄤ', 'aNg'),
80
+ ('ㄧㄥ', 'iNg'),
81
+ ('ㄨㄥ', 'uNg'),
82
+ ('ㄩㄥ', 'yuNg'),
83
+ ('ㄥ', 'əNg'),
84
+ ('ㄦ', 'əɻ'),
85
+ ('ㄧ', 'i'),
86
+ ('ㄨ', 'u'),
87
+ ('ㄩ', 'ɥ'),
88
+ ('ˉ', '→'),
89
+ ('ˊ', '↑'),
90
+ ('ˇ', '↓↑'),
91
+ ('ˋ', '↓'),
92
+ ('˙', ''),
93
+ (',', ','),
94
+ ('。', '.'),
95
+ ('!', '!'),
96
+ ('?', '?'),
97
+ ('—', '-')
98
+ ]]
99
+
100
+ # List of (romaji, ipa) pairs:
101
+ _romaji_to_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
102
+ ('ʃy', 'ʃ'),
103
+ ('ʧʰy', 'ʧʰ'),
104
+ ('ʧ⁼y', 'ʧ⁼'),
105
+ ('NN', 'n'),
106
+ ('Ng', 'ŋ'),
107
+ ('y', 'j'),
108
+ ('h', 'x')
109
+ ]]
110
+
111
+ # List of (bopomofo, ipa) pairs:
112
+ _bopomofo_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
113
+ ('ㄅㄛ', 'p⁼wo'),
114
+ ('ㄆㄛ', 'pʰwo'),
115
+ ('ㄇㄛ', 'mwo'),
116
+ ('ㄈㄛ', 'fwo'),
117
+ ('ㄅ', 'p⁼'),
118
+ ('ㄆ', 'pʰ'),
119
+ ('ㄇ', 'm'),
120
+ ('ㄈ', 'f'),
121
+ ('ㄉ', 't⁼'),
122
+ ('ㄊ', 'tʰ'),
123
+ ('ㄋ', 'n'),
124
+ ('ㄌ', 'l'),
125
+ ('ㄍ', 'k⁼'),
126
+ ('ㄎ', 'kʰ'),
127
+ ('ㄏ', 'x'),
128
+ ('ㄐ', 'tʃ⁼'),
129
+ ('ㄑ', 'tʃʰ'),
130
+ ('ㄒ', 'ʃ'),
131
+ ('ㄓ', 'ts`⁼'),
132
+ ('ㄔ', 'ts`ʰ'),
133
+ ('ㄕ', 's`'),
134
+ ('ㄖ', 'ɹ`'),
135
+ ('ㄗ', 'ts⁼'),
136
+ ('ㄘ', 'tsʰ'),
137
+ ('ㄙ', 's'),
138
+ ('ㄚ', 'a'),
139
+ ('ㄛ', 'o'),
140
+ ('ㄜ', 'ə'),
141
+ ('ㄝ', 'ɛ'),
142
+ ('ㄞ', 'aɪ'),
143
+ ('ㄟ', 'eɪ'),
144
+ ('ㄠ', 'ɑʊ'),
145
+ ('ㄡ', 'oʊ'),
146
+ ('ㄧㄢ', 'jɛn'),
147
+ ('ㄩㄢ', 'ɥæn'),
148
+ ('ㄢ', 'an'),
149
+ ('ㄧㄣ', 'in'),
150
+ ('ㄩㄣ', 'ɥn'),
151
+ ('ㄣ', 'ən'),
152
+ ('ㄤ', 'ɑŋ'),
153
+ ('ㄧㄥ', 'iŋ'),
154
+ ('ㄨㄥ', 'ʊŋ'),
155
+ ('ㄩㄥ', 'jʊŋ'),
156
+ ('ㄥ', 'əŋ'),
157
+ ('ㄦ', 'əɻ'),
158
+ ('ㄧ', 'i'),
159
+ ('ㄨ', 'u'),
160
+ ('ㄩ', 'ɥ'),
161
+ ('ˉ', '→'),
162
+ ('ˊ', '↑'),
163
+ ('ˇ', '↓↑'),
164
+ ('ˋ', '↓'),
165
+ ('˙', ''),
166
+ (',', ','),
167
+ ('。', '.'),
168
+ ('!', '!'),
169
+ ('?', '?'),
170
+ ('—', '-')
171
+ ]]
172
+
173
+ # List of (bopomofo, ipa2) pairs:
174
+ _bopomofo_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
175
+ ('ㄅㄛ', 'pwo'),
176
+ ('ㄆㄛ', 'pʰwo'),
177
+ ('ㄇㄛ', 'mwo'),
178
+ ('ㄈㄛ', 'fwo'),
179
+ ('ㄅ', 'p'),
180
+ ('ㄆ', 'pʰ'),
181
+ ('ㄇ', 'm'),
182
+ ('ㄈ', 'f'),
183
+ ('ㄉ', 't'),
184
+ ('ㄊ', 'tʰ'),
185
+ ('ㄋ', 'n'),
186
+ ('ㄌ', 'l'),
187
+ ('ㄍ', 'k'),
188
+ ('ㄎ', 'kʰ'),
189
+ ('ㄏ', 'h'),
190
+ ('ㄐ', 'tɕ'),
191
+ ('ㄑ', 'tɕʰ'),
192
+ ('ㄒ', 'ɕ'),
193
+ ('ㄓ', 'tʂ'),
194
+ ('ㄔ', 'tʂʰ'),
195
+ ('ㄕ', 'ʂ'),
196
+ ('ㄖ', 'ɻ'),
197
+ ('ㄗ', 'ts'),
198
+ ('ㄘ', 'tsʰ'),
199
+ ('ㄙ', 's'),
200
+ ('ㄚ', 'a'),
201
+ ('ㄛ', 'o'),
202
+ ('ㄜ', 'ɤ'),
203
+ ('ㄝ', 'ɛ'),
204
+ ('ㄞ', 'aɪ'),
205
+ ('ㄟ', 'eɪ'),
206
+ ('ㄠ', 'ɑʊ'),
207
+ ('ㄡ', 'oʊ'),
208
+ ('ㄧㄢ', 'jɛn'),
209
+ ('ㄩㄢ', 'yæn'),
210
+ ('ㄢ', 'an'),
211
+ ('ㄧㄣ', 'in'),
212
+ ('ㄩㄣ', 'yn'),
213
+ ('ㄣ', 'ən'),
214
+ ('ㄤ', 'ɑŋ'),
215
+ ('ㄧㄥ', 'iŋ'),
216
+ ('ㄨㄥ', 'ʊŋ'),
217
+ ('ㄩㄥ', 'jʊŋ'),
218
+ ('ㄥ', 'ɤŋ'),
219
+ ('ㄦ', 'əɻ'),
220
+ ('ㄧ', 'i'),
221
+ ('ㄨ', 'u'),
222
+ ('ㄩ', 'y'),
223
+ ('ˉ', '˥'),
224
+ ('ˊ', '˧˥'),
225
+ ('ˇ', '˨˩˦'),
226
+ ('ˋ', '˥˩'),
227
+ ('˙', ''),
228
+ (',', ','),
229
+ ('。', '.'),
230
+ ('!', '!'),
231
+ ('?', '?'),
232
+ ('—', '-')
233
+ ]]
234
+
235
+
236
def number_to_chinese(text):
    """Replace each Arabic number in `text` with the output of `cn2an.an2cn`.

    A number is a run of digits, optionally followed by a further run of
    digits with an optional '.' between them.
    """
    # Substituting match by match, left to right, gives the same result as
    # replacing the first remaining occurrence of each number in turn,
    # assuming an2cn output contains no ASCII digits.
    return re.sub(r'\d+(?:\.?\d+)?', lambda match: cn2an.an2cn(match.group(0)), text)
241
+
242
+
243
def chinese_to_bopomofo(text):
    """Convert the Chinese words in `text` to bopomofo, leaving other text as is.

    The text is segmented with jieba. Words with no character in the range
    U+4E00-U+9FFF are copied through unchanged. Each other word is converted
    syllable by syllable; a syllable ending in a bopomofo letter (that is,
    with no tone mark) gets 'ˉ' appended. Converted words are separated from
    preceding output by a space.
    """
    text = text.replace('、', ',').replace(';', ',').replace(':', ',')
    result = ''
    for word in jieba.lcut(text, cut_all=False):
        syllables = lazy_pinyin(word, BOPOMOFO)
        if re.search('[\u4e00-\u9fff]', word) is None:
            result += word
            continue
        syllables = [re.sub(r'([\u3105-\u3129])$', r'\1ˉ', syllable) for syllable in syllables]
        if result:
            result += ' '
        result += ''.join(syllables)
    return result
258
+
259
+
260
def latin_to_bopomofo(text):
    """Apply the `_latin_to_bopomofo` substitutions to `text`, in table order."""
    for pattern, bopomofo in _latin_to_bopomofo:
        text = pattern.sub(bopomofo, text)
    return text
264
+
265
+
266
def bopomofo_to_romaji(text):
    """Apply the `_bopomofo_to_romaji` substitutions to `text`, in table order."""
    for pattern, romaji in _bopomofo_to_romaji:
        text = pattern.sub(romaji, text)
    return text
270
+
271
+
272
def bopomofo_to_ipa(text):
    """Apply the `_bopomofo_to_ipa` substitutions to `text`, in table order."""
    for pattern, ipa_symbol in _bopomofo_to_ipa:
        text = pattern.sub(ipa_symbol, text)
    return text
276
+
277
+
278
def bopomofo_to_ipa2(text):
    """Apply the `_bopomofo_to_ipa2` substitutions to `text`, in table order."""
    for pattern, ipa_symbol in _bopomofo_to_ipa2:
        text = pattern.sub(ipa_symbol, text)
    return text
282
+
283
+
284
def chinese_to_romaji(text):
    """Convert Chinese text to the romaji-style transcription.

    Numbers, Chinese words and Latin letters are converted to bopomofo, the
    bopomofo is mapped through the romaji table, and a few regex passes then
    adjust the result.
    """
    for step in (number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, bopomofo_to_romaji):
        text = step(text)
    # i/u before a vowel become the glides y/w.
    text = re.sub('i([aoe])', r'y\1', text)
    text = re.sub('u([aoəe])', r'w\1', text)
    # Insert 'ɹ`' / 'ɹ' after an initial that is directly followed by tone
    # arrows, spaces or the end of the text.
    text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ`\2', text)
    text = text.replace('ɻ', 'ɹ`')
    text = re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text)
    return text
295
+
296
+
297
def chinese_to_lazy_ipa(text):
    """Convert Chinese text to romaji, then apply the `_romaji_to_ipa` substitutions."""
    romaji = chinese_to_romaji(text)
    for pattern, ipa_symbol in _romaji_to_ipa:
        romaji = pattern.sub(ipa_symbol, romaji)
    return romaji
302
+
303
+
304
def chinese_to_ipa(text):
    """Convert Chinese text to IPA with arrow tone marks.

    Numbers, Chinese words and Latin letters are converted to bopomofo, the
    bopomofo is mapped through the IPA table, and a few regex passes then
    adjust the result.
    """
    for step in (number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, bopomofo_to_ipa):
        text = step(text)
    # i/u before a vowel become the glides j/w.
    text = re.sub('i([aoe])', r'j\1', text)
    text = re.sub('u([aoəe])', r'w\1', text)
    # Insert 'ɹ`' / 'ɹ' after an initial that is directly followed by tone
    # arrows, spaces or the end of the text.
    text = re.sub('([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ`\2', text)
    text = text.replace('ɻ', 'ɹ`')
    text = re.sub('([s][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text)
    return text
315
+
316
+
317
def chinese_to_ipa2(text):
    """Convert Chinese text to IPA with tone-letter marks.

    Numbers, Chinese words and Latin letters are converted to bopomofo, the
    bopomofo is mapped through the IPA2 table, and a few regex passes then
    adjust the result.
    """
    for step in (number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, bopomofo_to_ipa2):
        text = step(text)
    # i/u before a vowel become the glides j/w.
    text = re.sub(r'i([aoe])', r'j\1', text)
    text = re.sub(r'u([aoəe])', r'w\1', text)
    # Insert 'ʅ' / 'ɿ' after an initial that is directly followed by tone
    # letters, spaces or the end of the text.
    text = re.sub(r'([ʂɹ]ʰ?)([˩˨˧˦˥ ]+|$)', r'\1ʅ\2', text)
    text = re.sub(r'(sʰ?)([˩˨˧˦˥ ]+|$)', r'\1ɿ\2', text)
    return text
openvoice/text/symbols.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ Defines the set of symbols used in text input to the model.
3
+ '''
4
+
5
+ # japanese_cleaners
6
+ # _pad = '_'
7
+ # _punctuation = ',.!?-'
8
+ # _letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧ↓↑ '
9
+
10
+
11
+ '''# japanese_cleaners2
12
+ _pad = '_'
13
+ _punctuation = ',.!?-~…'
14
+ _letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧʦ↓↑ '
15
+ '''
16
+
17
+
18
+ '''# korean_cleaners
19
+ _pad = '_'
20
+ _punctuation = ',.!?…~'
21
+ _letters = 'ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ '
22
+ '''
23
+
24
+ '''# chinese_cleaners
25
+ _pad = '_'
26
+ _punctuation = ',。!?—…'
27
+ _letters = 'ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩˉˊˇˋ˙ '
28
+ '''
29
+
30
+ # # zh_ja_mixture_cleaners
31
+ # _pad = '_'
32
+ # _punctuation = ',.!?-~…'
33
+ # _letters = 'AEINOQUabdefghijklmnoprstuvwyzʃʧʦɯɹəɥ⁼ʰ`→↓↑ '
34
+
35
+
36
+ '''# sanskrit_cleaners
37
+ _pad = '_'
38
+ _punctuation = '।'
39
+ _letters = 'ँंःअआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलळवशषसहऽािीुूृॄेैोौ्ॠॢ '
40
+ '''
41
+
42
+ '''# cjks_cleaners
43
+ _pad = '_'
44
+ _punctuation = ',.!?-~…'
45
+ _letters = 'NQabdefghijklmnopstuvwxyzʃʧʥʦɯɹəɥçɸɾβŋɦː⁼ʰ`^#*=→↓↑ '
46
+ '''
47
+
48
+ '''# thai_cleaners
49
+ _pad = '_'
50
+ _punctuation = '.!? '
51
+ _letters = 'กขฃคฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลวศษสหฬอฮฯะัาำิีึืุูเแโใไๅๆ็่้๊๋์'
52
+ '''
53
+
54
+ # # cjke_cleaners2
55
+ _pad = '_'
56
+ _punctuation = ',.!?-~…'
57
+ _letters = 'NQabdefghijklmnopstuvwxyzɑæʃʑçɯɪɔɛɹðəɫɥɸʊɾʒθβŋɦ⁼ʰ`^#*=ˈˌ→↓↑ '
58
+
59
+
60
+ '''# shanghainese_cleaners
61
+ _pad = '_'
62
+ _punctuation = ',.!?…'
63
+ _letters = 'abdfghiklmnopstuvyzøŋȵɑɔɕəɤɦɪɿʑʔʰ̩̃ᴀᴇ15678 '
64
+ '''
65
+
66
+ '''# chinese_dialect_cleaners
67
+ _pad = '_'
68
+ _punctuation = ',.!?~…─'
69
+ _letters = '#Nabdefghijklmnoprstuvwxyzæçøŋœȵɐɑɒɓɔɕɗɘəɚɛɜɣɤɦɪɭɯɵɷɸɻɾɿʂʅʊʋʌʏʑʔʦʮʰʷˀː˥˦˧˨˩̥̩̃̚ᴀᴇ↑↓∅ⱼ '
70
+ '''
71
+
72
+ # Export all symbols:
73
+ symbols = [_pad] + list(_punctuation) + list(_letters)
74
+
75
+ # Special symbol ids
76
+ SPACE_ID = symbols.index(" ")
77
+
78
+ num_ja_tones = 1
79
+ num_kr_tones = 1
80
+ num_zh_tones = 6
81
+ num_en_tones = 4
82
+
83
+ language_tone_start_map = {
84
+ "ZH": 0,
85
+ "JP": num_zh_tones,
86
+ "EN": num_zh_tones + num_ja_tones,
87
+ 'KR': num_zh_tones + num_ja_tones + num_en_tones,
88
+ }
openvoice/transforms.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch.nn import functional as F
3
+
4
+ import numpy as np
5
+
6
+
7
+ DEFAULT_MIN_BIN_WIDTH = 1e-3
8
+ DEFAULT_MIN_BIN_HEIGHT = 1e-3
9
+ DEFAULT_MIN_DERIVATIVE = 1e-3
10
+
11
+
12
def piecewise_rational_quadratic_transform(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
    tails=None,
    tail_bound=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
    min_derivative=DEFAULT_MIN_DERIVATIVE,
):
    """Apply a rational-quadratic spline, with or without tails.

    With ``tails=None`` the call goes to ``rational_quadratic_spline`` using
    its default domain. Otherwise it goes to
    ``unconstrained_rational_quadratic_spline`` with ``tails`` and
    ``tail_bound``. All other arguments are forwarded unchanged.

    Returns:
        ``(outputs, logabsdet)`` as produced by the selected spline function.
    """
    shared_kwargs = dict(
        inputs=inputs,
        unnormalized_widths=unnormalized_widths,
        unnormalized_heights=unnormalized_heights,
        unnormalized_derivatives=unnormalized_derivatives,
        inverse=inverse,
        min_bin_width=min_bin_width,
        min_bin_height=min_bin_height,
        min_derivative=min_derivative,
    )
    if tails is None:
        outputs, logabsdet = rational_quadratic_spline(**shared_kwargs)
    else:
        outputs, logabsdet = unconstrained_rational_quadratic_spline(
            tails=tails, tail_bound=tail_bound, **shared_kwargs
        )
    return outputs, logabsdet
43
+
44
+
45
def searchsorted(bin_locations, inputs, eps=1e-6):
    """Return, for each input, the index of the bin that contains it.

    The index is the number of bin edges less than or equal to the input,
    minus one. The last edge is nudged up by ``eps`` so that an input equal to
    it falls in the final bin rather than one past it.

    Args:
        bin_locations: tensor whose last dimension holds the bin edges.
        inputs: tensor of values to locate; broadcast against the edges.
        eps: amount added to the last edge before comparing.

    Returns:
        Integer tensor of bin indices.
    """
    # Work on a copy: the original added eps to the caller's tensor in place,
    # silently changing its last edge on every call.
    bin_locations = bin_locations.clone()
    bin_locations[..., -1] += eps
    return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1
48
+
49
+
50
def unconstrained_rational_quadratic_spline(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
    tails="linear",
    tail_bound=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
    min_derivative=DEFAULT_MIN_DERIVATIVE,
):
    """Apply a rational-quadratic spline inside ``[-tail_bound, tail_bound]``.

    Inputs outside the interval are passed through unchanged with a zero
    log-determinant. Inputs inside it are transformed by
    ``rational_quadratic_spline`` over the square
    ``[-tail_bound, tail_bound] x [-tail_bound, tail_bound]``.

    Returns:
        ``(outputs, logabsdet)``, each shaped like ``inputs``.

    Raises:
        RuntimeError: if ``tails`` is anything other than ``"linear"``.
    """
    inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
    outside_interval_mask = ~inside_interval_mask

    outputs = torch.zeros_like(inputs)
    logabsdet = torch.zeros_like(inputs)

    if tails == "linear":
        # Pad one derivative at each end and set both to the value for which
        # min_derivative + softplus(value) == 1, so the spline meets the
        # identity tails with slope 1 at the interval boundaries.
        unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
        constant = np.log(np.exp(1 - min_derivative) - 1)
        unnormalized_derivatives[..., 0] = constant
        unnormalized_derivatives[..., -1] = constant

        outputs[outside_interval_mask] = inputs[outside_interval_mask]
        logabsdet[outside_interval_mask] = 0
    else:
        raise RuntimeError("{} tails are not implemented.".format(tails))

    # With no input inside the interval the inner spline would receive an empty
    # tensor and fail in torch.min/torch.max; nothing is left to transform.
    if inside_interval_mask.any():
        (
            outputs[inside_interval_mask],
            logabsdet[inside_interval_mask],
        ) = rational_quadratic_spline(
            inputs=inputs[inside_interval_mask],
            unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
            unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
            unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
            inverse=inverse,
            left=-tail_bound,
            right=tail_bound,
            bottom=-tail_bound,
            top=tail_bound,
            min_bin_width=min_bin_width,
            min_bin_height=min_bin_height,
            min_derivative=min_derivative,
        )

    return outputs, logabsdet
98
+
99
+
100
def rational_quadratic_spline(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
    left=0.0,
    right=1.0,
    bottom=0.0,
    top=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
    min_derivative=DEFAULT_MIN_DERIVATIVE,
):
    """Evaluate a rational-quadratic spline, or its inverse, element-wise.

    Bin widths and heights come from a softmax over the unnormalized
    parameters, floored at ``min_bin_width`` / ``min_bin_height`` and scaled to
    ``[left, right]`` / ``[bottom, top]``. Knot derivatives are
    ``min_derivative + softplus(unnormalized_derivatives)``.

    Returns:
        ``(outputs, logabsdet)``. In inverse mode the returned log-determinant
        is the negated log-derivative of the forward map at the solution.

    Raises:
        ValueError: if an input lies outside ``[left, right]``, or the minimum
            bin width/height is too large for the number of bins.
    """
    if torch.min(inputs) < left or torch.max(inputs) > right:
        raise ValueError("Input to a transform is not within its domain")

    num_bins = unnormalized_widths.shape[-1]

    if min_bin_width * num_bins > 1.0:
        raise ValueError("Minimal bin width too large for the number of bins")
    if min_bin_height * num_bins > 1.0:
        raise ValueError("Minimal bin height too large for the number of bins")

    def _knots(unnormalized, min_size, lower, upper):
        # Floored bin sizes -> cumulative edges on [lower, upper]; the outer
        # edges are pinned exactly and the sizes recomputed from the edges.
        sizes = F.softmax(unnormalized, dim=-1)
        sizes = min_size + (1 - min_size * num_bins) * sizes
        edges = torch.cumsum(sizes, dim=-1)
        edges = F.pad(edges, pad=(1, 0), mode="constant", value=0.0)
        edges = (upper - lower) * edges + lower
        edges[..., 0] = lower
        edges[..., -1] = upper
        return edges, edges[..., 1:] - edges[..., :-1]

    cumwidths, widths = _knots(unnormalized_widths, min_bin_width, left, right)
    derivatives = min_derivative + F.softplus(unnormalized_derivatives)
    cumheights, heights = _knots(unnormalized_heights, min_bin_height, bottom, top)

    # The inverse looks inputs up along the output (height) axis.
    bin_idx = searchsorted(cumheights if inverse else cumwidths, inputs)[..., None]

    def _pick(table):
        # Per-element value of `table` for the bin each input falls in.
        return table.gather(-1, bin_idx)[..., 0]

    input_cumwidths = _pick(cumwidths)
    input_bin_widths = _pick(widths)

    input_cumheights = _pick(cumheights)
    delta = heights / widths
    input_delta = _pick(delta)

    input_derivatives = _pick(derivatives)
    input_derivatives_plus_one = _pick(derivatives[..., 1:])

    input_heights = _pick(heights)

    # Term shared by the quadratic coefficients and both denominators.
    curvature = input_derivatives + input_derivatives_plus_one - 2 * input_delta

    def _log_derivative(position, position_one_minus_position, denominator):
        # Log-derivative of the forward map at relative bin position `position`.
        derivative_numerator = input_delta.pow(2) * (
            input_derivatives_plus_one * position.pow(2)
            + 2 * input_delta * position_one_minus_position
            + input_derivatives * (1 - position).pow(2)
        )
        return torch.log(derivative_numerator) - 2 * torch.log(denominator)

    if inverse:
        offset = inputs - input_cumheights
        a = offset * curvature + input_heights * (input_delta - input_derivatives)
        b = input_heights * input_derivatives - offset * curvature
        c = -input_delta * offset

        discriminant = b.pow(2) - 4 * a * c
        assert (discriminant >= 0).all()

        root = (2 * c) / (-b - torch.sqrt(discriminant))
        outputs = root * input_bin_widths + input_cumwidths

        theta_one_minus_theta = root * (1 - root)
        denominator = input_delta + (curvature * theta_one_minus_theta)
        logabsdet = _log_derivative(root, theta_one_minus_theta, denominator)

        return outputs, -logabsdet

    theta = (inputs - input_cumwidths) / input_bin_widths
    theta_one_minus_theta = theta * (1 - theta)

    numerator = input_heights * (
        input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
    )
    denominator = input_delta + (curvature * theta_one_minus_theta)
    outputs = input_cumheights + numerator / denominator

    logabsdet = _log_derivative(theta, theta_one_minus_theta, denominator)

    return outputs, logabsdet
openvoice/utils.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import json
3
+ import numpy as np
4
+
5
+
6
def get_hparams_from_file(config_path):
    """Load a JSON configuration file and wrap it in an ``HParams`` object.

    Args:
        config_path: Path to a UTF-8 encoded JSON file whose top level is an
            object.

    Returns:
        HParams: The parsed configuration, built from the top-level keys.
    """
    with open(config_path, "r", encoding="utf-8") as config_file:
        config = json.load(config_file)
    return HParams(**config)
13
+
14
class HParams:
    """Attribute-style container for (possibly nested) hyper-parameters.

    Every keyword argument becomes an attribute. Mapping values (``dict`` and
    its subclasses) are converted recursively into nested ``HParams`` so that
    ``hp.model.hidden`` works. The object also supports a small dict-like
    interface: ``keys``/``items``/``values``, ``len``, ``in`` and ``[]``.
    """

    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            # isinstance (rather than an exact type comparison) also converts
            # dict subclasses such as OrderedDict.
            if isinstance(v, dict):
                v = HParams(**v)
            self[k] = v

    def keys(self):
        """Return a view of the stored parameter names."""
        return self.__dict__.keys()

    def items(self):
        """Return a view of ``(name, value)`` pairs."""
        return self.__dict__.items()

    def values(self):
        """Return a view of the stored parameter values."""
        return self.__dict__.values()

    def __len__(self):
        return len(self.__dict__)

    def __getitem__(self, key):
        return getattr(self, key)

    def __setitem__(self, key, value):
        return setattr(self, key, value)

    def __contains__(self, key):
        return key in self.__dict__

    def __repr__(self):
        return repr(self.__dict__)
44
+
45
+
46
def string_to_bits(string, pad_len=8):
    """Encode a string as a fixed-size ``(pad_len, 8)`` array of bits.

    Each character becomes one row holding the 8-bit, most-significant-bit
    first binary form of its code point. Input longer than ``pad_len``
    characters is truncated; shorter input (including the empty string) is
    padded with rows equal to ``00100000``, i.e. the space character.

    Args:
        string: Text to encode. Every character that is kept must have a code
            point below 256 so that it fits into one 8-bit row.
        pad_len: Number of rows in the returned array.

    Returns:
        numpy.ndarray: Integer array of shape ``(pad_len, 8)`` holding 0/1.

    Raises:
        ValueError: If a kept character does not fit into 8 bits.
    """
    # Only the first pad_len characters can end up in the output.
    code_points = [ord(char) for char in string[:pad_len]]
    if any(value > 0xFF for value in code_points):
        raise ValueError(
            "string_to_bits only supports characters with code points below 256"
        )

    bit_rows = [[int(bit) for bit in format(value, "08b")] for value in code_points]

    # reshape keeps the array two-dimensional even for empty input, so the
    # slice assignment below never has to broadcast a (0,) array into (0, 8).
    bits = np.array(bit_rows, dtype=int).reshape(-1, 8)

    padded = np.zeros((pad_len, 8), dtype=bits.dtype)
    padded[:, 2] = 1  # pre-fill every row with 00100000 (space) as padding
    used = min(pad_len, len(bits))
    padded[:used] = bits[:used]
    return padded
63
+
64
+
65
def bits_to_string(bits_array):
    """Decode rows of bits into a string, one character per row.

    Args:
        bits_array: Iterable of rows; each row is an iterable of 0/1 values
            read as a binary number, first element most significant.

    Returns:
        str: The characters whose code points are the decoded row values.
    """
    decoded_chars = []
    for row in bits_array:
        row_bits = ''.join(map(str, row))
        decoded_chars.append(chr(int(row_bits, 2)))
    return ''.join(decoded_chars)
76
+
77
+
78
def split_sentence(text, min_len=10, language_str='[EN]'):
    """Split ``text`` into short pieces using the splitter for its language.

    Args:
        text: Input text.
        min_len: Length threshold forwarded to the language-specific splitter.
        language_str: Language marker. ``'EN'`` and the bracketed ``'[EN]'``
            select the Latin splitter; any other value selects the Chinese
            splitter.

    Returns:
        List[str]: The pieces produced by the selected splitter.
    """
    # The default value is spelled '[EN]', so the bracketed form has to be
    # accepted as well; matching only 'EN' sent the default call to the
    # Chinese splitter.
    if language_str in ('EN', '[EN]'):
        return split_sentences_latin(text, min_len=min_len)
    return split_sentences_zh(text, min_len=min_len)
84
+
85
def split_sentences_latin(text, min_len=10):
    """Split long text into a list of shorter pieces at punctuation marks.

    The text is normalised (full-width punctuation and curly quotes mapped to
    ASCII, brackets and double quotes removed, whitespace collapsed), cut
    after every ``, . ! ? ;`` and the fragments are then regrouped: fragments
    are accumulated until their total word count exceeds ``min_len``.

    Args:
        text: Input text.
        min_len: Word-count threshold; a piece is closed once the accumulated
            number of space-separated words is greater than this value.

    Returns:
        List[str]: The pieces, after short ones have been merged by
        ``merge_short_sentences_latin``.
    """
    # Normalise full-width punctuation. The classes are written as escapes
    # (ideographic full stop, full-width ! ? ; and full-width comma) so they
    # cannot silently degrade into their ASCII look-alikes, which would turn
    # the comma rule into a no-op and flatten ASCII "!?;" into ".".
    text = re.sub('[\u3002\uff01\uff1f\uff1b]', '.', text)
    text = re.sub('[\uff0c]', ',', text)
    text = re.sub('[“”]', '"', text)
    text = re.sub('[‘’]', "'", text)
    text = re.sub(r"[\<\>\(\)\[\]\"\«\»]+", "", text)
    text = re.sub('[\n\t ]+', ' ', text)
    # Insert a split marker after every sentence/clause punctuation mark.
    text = re.sub('([,.!?;])', r'\1 $#!', text)

    sentences = [s.strip() for s in text.split('$#!')]
    # Text ending in punctuation leaves an empty trailing fragment.
    if len(sentences[-1]) == 0:
        del sentences[-1]

    new_sentences = []
    new_sent = []
    count_len = 0
    last_index = len(sentences) - 1
    for ind, sent in enumerate(sentences):
        new_sent.append(sent)
        count_len += len(sent.split(" "))
        # Close the current piece when it is long enough or input is exhausted.
        if count_len > min_len or ind == last_index:
            count_len = 0
            new_sentences.append(' '.join(new_sent))
            new_sent = []
    return merge_short_sentences_latin(new_sentences)
118
+
119
+
120
def merge_short_sentences_latin(sens):
    """Avoid short sentences by merging them with a neighbouring sentence.

    A sentence counts as short when it has at most two space-separated words.
    A short sentence is joined (with a single space) to the sentence that
    follows it; a short final sentence is joined to the one before it.

    Args:
        sens: List of input sentences.

    Returns:
        List[str]: List of output sentences. Empty and single-element inputs
        are returned unchanged (as a new list).
    """
    sens_out = []
    for s in sens:
        # If the previous sentence is too short, merge it with the current one.
        if sens_out and len(sens_out[-1].split(" ")) <= 2:
            sens_out[-1] = sens_out[-1] + " " + s
        else:
            sens_out.append(s)
    # A short trailing sentence has no successor, so fold it into the previous
    # one. The explicit length check replaces a bare ``except`` that relied on
    # IndexError for the empty and single-sentence cases.
    if len(sens_out) >= 2 and len(sens_out[-1].split(" ")) <= 2:
        sens_out[-2] = sens_out[-2] + " " + sens_out[-1]
        sens_out.pop()
    return sens_out
144
+
145
def split_sentences_zh(text, min_len=10):
    """Split long text into a list of shorter pieces, counting characters.

    Full-width punctuation is mapped to ASCII, whitespace is collapsed, the
    text is cut after every ``, . ! ? ;`` and the fragments are regrouped:
    fragments are accumulated until their total character count exceeds
    ``min_len``.

    Args:
        text: Input text.
        min_len: Character-count threshold; a piece is closed once the
            accumulated fragment length is greater than this value.

    Returns:
        List[str]: The pieces, after short ones have been merged by
        ``merge_short_sentences_zh``.
    """
    # Normalise full-width punctuation. The classes are written as escapes
    # (ideographic full stop, full-width ! ? ; and full-width comma) so they
    # cannot silently degrade into their ASCII look-alikes, which would turn
    # the comma rule into a no-op and flatten ASCII "!?;" into ".".
    text = re.sub('[\u3002\uff01\uff1f\uff1b]', '.', text)
    text = re.sub('[\uff0c]', ',', text)
    # Collapse runs of newlines, tabs and spaces into a single space.
    text = re.sub('[\n\t ]+', ' ', text)
    # Append a space and the split marker after every punctuation mark.
    text = re.sub('([,.!?;])', r'\1 $#!', text)
    # Split on the marker and trim surrounding whitespace.
    sentences = [s.strip() for s in text.split('$#!')]
    # Text ending in punctuation leaves an empty trailing fragment.
    if len(sentences[-1]) == 0:
        del sentences[-1]

    new_sentences = []
    new_sent = []
    count_len = 0
    last_index = len(sentences) - 1
    for ind, sent in enumerate(sentences):
        new_sent.append(sent)
        count_len += len(sent)
        # Close the current piece when it is long enough or input is exhausted.
        if count_len > min_len or ind == last_index:
            count_len = 0
            new_sentences.append(' '.join(new_sent))
            new_sent = []
    return merge_short_sentences_zh(new_sentences)
168
+
169
+
170
def merge_short_sentences_zh(sens):
    """Avoid short sentences by merging them with a neighbouring sentence.

    A sentence counts as short when it is at most two characters long. A
    short sentence is joined (with a single space) to the sentence that
    follows it; a short final sentence is joined to the one before it.

    Args:
        sens: List of input sentences.

    Returns:
        List[str]: List of output sentences. Empty and single-element inputs
        are returned unchanged (as a new list).
    """
    sens_out = []
    for s in sens:
        # If the previous sentence is too short, merge it with the current one.
        if sens_out and len(sens_out[-1]) <= 2:
            sens_out[-1] = sens_out[-1] + " " + s
        else:
            sens_out.append(s)
    # A short trailing sentence has no successor, so fold it into the previous
    # one. The explicit length check replaces a bare ``except`` that relied on
    # IndexError for the empty and single-sentence cases.
    if len(sens_out) >= 2 and len(sens_out[-1]) <= 2:
        sens_out[-2] = sens_out[-2] + " " + sens_out[-1]
        sens_out.pop()
    return sens_out
requirements.txt ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file may be used to create an environment using:
2
+ # $ conda create --name <env> --file <this file>
3
+ # platform: linux-64
4
+ _libgcc_mutex=0.1=main
5
+ _openmp_mutex=5.1=1_gnu
6
+ absl-py=2.1.0=pypi_0
7
+ aiofiles=23.2.1=pypi_0
8
+ altair=5.3.0=pypi_0
9
+ annotated-types=0.7.0=pypi_0
10
+ anyascii=0.3.2=pypi_0
11
+ anyio=4.4.0=pypi_0
12
+ appdirs=1.4.4=pypi_0
13
+ asttokens=2.4.1=pypi_0
14
+ attrs=23.2.0=pypi_0
15
+ audioread=3.0.1=pypi_0
16
+ av=10.0.0=pypi_0
17
+ babel=2.15.0=pypi_0
18
+ backcall=0.2.0=pypi_0
19
+ beautifulsoup4=4.12.3=pypi_0
20
+ bert-extractive-summarizer=0.10.1=pypi_0
21
+ bleach=6.1.0=pypi_0
22
+ blis=0.7.11=pypi_0
23
+ boto3=1.34.143=pypi_0
24
+ botocore=1.34.143=pypi_0
25
+ bzip2=1.0.8=h5eee18b_6
26
+ ca-certificates=2024.7.2=h06a4308_0
27
+ cached-path=1.6.3=pypi_0
28
+ cachetools=5.3.3=pypi_0
29
+ catalogue=2.0.10=pypi_0
30
+ certifi=2024.7.4=pypi_0
31
+ cffi=1.16.0=pypi_0
32
+ charset-normalizer=3.3.2=pypi_0
33
+ click=8.1.7=pypi_0
34
+ cloudpathlib=0.18.1=pypi_0
35
+ cn2an=0.5.22=pypi_0
36
+ coloredlogs=15.0.1=pypi_0
37
+ confection=0.1.5=pypi_0
38
+ contourpy=1.2.1=pypi_0
39
+ cryptography=42.0.8=pypi_0
40
+ ctranslate2=3.24.0=pypi_0
41
+ cycler=0.12.1=pypi_0
42
+ cymem=2.0.8=pypi_0
43
+ cython=3.0.10=pypi_0
44
+ dateparser=1.1.8=pypi_0
45
+ decorator=4.4.2=pypi_0
46
+ deepfilterlib=0.5.6=pypi_0
47
+ deepfilternet=0.5.6=pypi_0
48
+ defusedxml=0.7.1=pypi_0
49
+ deprecated=1.2.14=pypi_0
50
+ dill=0.3.8=pypi_0
51
+ distance=0.1.3=pypi_0
52
+ dnspython=2.6.1=pypi_0
53
+ docopt=0.6.2=pypi_0
54
+ dtw-python=1.4.4=pypi_0
55
+ email-validator=2.2.0=pypi_0
56
+ eng-to-ipa=0.0.2=pypi_0
57
+ exceptiongroup=1.2.1=pypi_0
58
+ executing=2.0.1=pypi_0
59
+ fastapi=0.111.0=pypi_0
60
+ fastapi-cli=0.0.4=pypi_0
61
+ faster-whisper=0.9.0=pypi_0
62
+ fastjsonschema=2.20.0=pypi_0
63
+ ffmpeg-python=0.2.0=pypi_0
64
+ ffmpy=0.3.2=pypi_0
65
+ filelock=3.13.4=pypi_0
66
+ flatbuffers=24.3.25=pypi_0
67
+ fonttools=4.53.1=pypi_0
68
+ fsspec=2024.6.1=pypi_0
69
+ fugashi=1.3.0=pypi_0
70
+ future=1.0.0=pypi_0
71
+ g2p-en=2.1.0=pypi_0
72
+ g2pkk=0.1.2=pypi_0
73
+ google-api-core=2.19.1=pypi_0
74
+ google-auth=2.32.0=pypi_0
75
+ google-cloud-core=2.4.1=pypi_0
76
+ google-cloud-storage=2.17.0=pypi_0
77
+ google-crc32c=1.5.0=pypi_0
78
+ google-resumable-media=2.7.1=pypi_0
79
+ googleapis-common-protos=1.63.2=pypi_0
80
+ gradio=4.38.1=pypi_0
81
+ gradio-client=1.1.0=pypi_0
82
+ grpcio=1.64.1=pypi_0
83
+ gruut=2.2.3=pypi_0
84
+ gruut-ipa=0.13.0=pypi_0
85
+ gruut-lang-de=2.0.1=pypi_0
86
+ gruut-lang-en=2.0.1=pypi_0
87
+ gruut-lang-es=2.0.1=pypi_0
88
+ gruut-lang-fr=2.0.2=pypi_0
89
+ h11=0.14.0=pypi_0
90
+ httpcore=1.0.5=pypi_0
91
+ httptools=0.6.1=pypi_0
92
+ httpx=0.27.0=pypi_0
93
+ huggingface-hub=0.23.4=pypi_0
94
+ humanfriendly=10.0=pypi_0
95
+ idna=3.7=pypi_0
96
+ imageio=2.34.2=pypi_0
97
+ imageio-ffmpeg=0.5.1=pypi_0
98
+ importlib-resources=6.4.0=pypi_0
99
+ inflect=7.0.0=pypi_0
100
+ ipython=8.12.3=pypi_0
101
+ jaconv=0.3.4=pypi_0
102
+ jamo=0.4.1=pypi_0
103
+ jedi=0.19.1=pypi_0
104
+ jieba=0.42.1=pypi_0
105
+ jinja2=3.1.4=pypi_0
106
+ jmespath=1.0.1=pypi_0
107
+ joblib=1.4.2=pypi_0
108
+ jsonlines=1.2.0=pypi_0
109
+ jsonschema=4.23.0=pypi_0
110
+ jsonschema-specifications=2023.12.1=pypi_0
111
+ jupyter-client=8.6.2=pypi_0
112
+ jupyter-core=5.7.2=pypi_0
113
+ jupyterlab-pygments=0.3.0=pypi_0
114
+ kiwisolver=1.4.5=pypi_0
115
+ langcodes=3.4.0=pypi_0
116
+ langid=1.1.6=pypi_0
117
+ language-data=1.2.0=pypi_0
118
+ ld_impl_linux-64=2.38=h1181459_1
119
+ libffi=3.4.4=h6a678d5_1
120
+ libgcc-ng=11.2.0=h1234567_1
121
+ libgomp=11.2.0=h1234567_1
122
+ libretranslatepy=2.1.1=pypi_0
123
+ librosa=0.9.1=pypi_0
124
+ libstdcxx-ng=11.2.0=h1234567_1
125
+ libuuid=1.41.5=h5eee18b_0
126
+ llvmlite=0.43.0=pypi_0
127
+ loguru=0.7.2=pypi_0
128
+ lxml=5.2.2=pypi_0
129
+ marisa-trie=1.2.0=pypi_0
130
+ markdown=3.6=pypi_0
131
+ markdown-it-py=3.0.0=pypi_0
132
+ markupsafe=2.1.5=pypi_0
133
+ matplotlib=3.8.4=pypi_0
134
+ matplotlib-inline=0.1.7=pypi_0
135
+ mdurl=0.1.2=pypi_0
136
+ mecab-python3=1.0.5=pypi_0
137
+ melotts=0.1.2=pypi_0
138
+ mistune=3.0.2=pypi_0
139
+ more-itertools=10.3.0=pypi_0
140
+ moviepy=1.0.3=pypi_0
141
+ mpmath=1.3.0=pypi_0
142
+ multiprocess=0.70.16=pypi_0
143
+ murmurhash=1.0.10=pypi_0
144
+ myshell-openvoice=0.0.0=dev_0
145
+ nbclient=0.10.0=pypi_0
146
+ nbconvert=7.16.4=pypi_0
147
+ nbformat=5.10.4=pypi_0
148
+ ncurses=6.4=h6a678d5_0
149
+ networkx=2.8.8=pypi_0
150
+ nltk=3.8.1=pypi_0
151
+ noisereduce=3.0.2=pypi_0
152
+ num2words=0.5.12=pypi_0
153
+ numba=0.60.0=pypi_0
154
+ numpy=1.22.0=pypi_0
155
+ nvidia-cublas-cu12=12.1.3.1=pypi_0
156
+ nvidia-cuda-cupti-cu12=12.1.105=pypi_0
157
+ nvidia-cuda-nvrtc-cu12=12.1.105=pypi_0
158
+ nvidia-cuda-runtime-cu12=12.1.105=pypi_0
159
+ nvidia-cudnn-cu12=8.9.2.26=pypi_0
160
+ nvidia-cufft-cu12=11.0.2.54=pypi_0
161
+ nvidia-curand-cu12=10.3.2.106=pypi_0
162
+ nvidia-cusolver-cu12=11.4.5.107=pypi_0
163
+ nvidia-cusparse-cu12=12.1.0.106=pypi_0
164
+ nvidia-nccl-cu12=2.20.5=pypi_0
165
+ nvidia-nvjitlink-cu12=12.5.82=pypi_0
166
+ nvidia-nvtx-cu12=12.1.105=pypi_0
167
+ onnxruntime=1.18.1=pypi_0
168
+ openai-whisper=20231117=pypi_0
169
+ openssl=3.0.14=h5eee18b_0
170
+ orjson=3.10.6=pypi_0
171
+ packaging=23.2=pypi_0
172
+ pandas=2.0.3=pypi_0
173
+ pandocfilters=1.5.1=pypi_0
174
+ parso=0.8.4=pypi_0
175
+ pathos=0.3.2=pypi_0
176
+ pexpect=4.9.0=pypi_0
177
+ pickleshare=0.7.5=pypi_0
178
+ pillow=10.4.0=pypi_0
179
+ pip=24.0=py310h06a4308_0
180
+ pipreqs=0.5.0=pypi_0
181
+ plac=1.4.3=pypi_0
182
+ platformdirs=4.2.2=pypi_0
183
+ pooch=1.8.2=pypi_0
184
+ pox=0.3.4=pypi_0
185
+ ppft=1.7.6.8=pypi_0
186
+ preshed=3.0.9=pypi_0
187
+ proces=0.1.7=pypi_0
188
+ proglog=0.1.10=pypi_0
189
+ prompt-toolkit=3.0.47=pypi_0
190
+ proto-plus=1.24.0=pypi_0
191
+ protobuf=5.27.2=pypi_0
192
+ ptyprocess=0.7.0=pypi_0
193
+ pure-eval=0.2.2=pypi_0
194
+ pyasn1=0.6.0=pypi_0
195
+ pyasn1-modules=0.4.0=pypi_0
196
+ pycparser=2.22=pypi_0
197
+ pydantic=2.8.2=pypi_0
198
+ pydantic-core=2.20.1=pypi_0
199
+ pydub=0.25.1=pypi_0
200
+ pyexecjs=1.5.1=pypi_0
201
+ pygments=2.18.0=pypi_0
202
+ pykakasi=2.2.1=pypi_0
203
+ pyparsing=3.1.2=pypi_0
204
+ pypinyin=0.50.0=pypi_0
205
+ python=3.10.14=h955ad1f_1
206
+ python-crfsuite=0.9.10=pypi_0
207
+ python-dateutil=2.9.0.post0=pypi_0
208
+ python-dotenv=1.0.1=pypi_0
209
+ python-mecab-ko=1.3.7=pypi_0
210
+ python-mecab-ko-dic=2.1.1.post2=pypi_0
211
+ python-multipart=0.0.9=pypi_0
212
+ pytz=2024.1=pypi_0
213
+ pyyaml=6.0.1=pypi_0
214
+ pyzmq=26.0.3=pypi_0
215
+ readline=8.2=h5eee18b_0
216
+ referencing=0.35.1=pypi_0
217
+ regex=2024.5.15=pypi_0
218
+ requests=2.32.3=pypi_0
219
+ resampy=0.4.3=pypi_0
220
+ rich=13.7.1=pypi_0
221
+ rpds-py=0.19.0=pypi_0
222
+ rsa=4.9=pypi_0
223
+ ruff=0.5.2=pypi_0
224
+ s3transfer=0.10.2=pypi_0
225
+ scikit-learn=1.5.1=pypi_0
226
+ scipy=1.11.4=pypi_0
227
+ semantic-version=2.10.0=pypi_0
228
+ setuptools=69.5.1=py310h06a4308_0
229
+ shellingham=1.5.4=pypi_0
230
+ six=1.16.0=pypi_0
231
+ smart-open=7.0.4=pypi_0
232
+ sniffio=1.3.1=pypi_0
233
+ soundfile=0.12.1=pypi_0
234
+ soupsieve=2.5=pypi_0
235
+ spacy=3.7.5=pypi_0
236
+ spacy-legacy=3.0.12=pypi_0
237
+ spacy-loggers=1.0.5=pypi_0
238
+ sqlite=3.45.3=h5eee18b_0
239
+ srsly=2.4.8=pypi_0
240
+ stack-data=0.6.3=pypi_0
241
+ starlette=0.37.2=pypi_0
242
+ sympy=1.13.0=pypi_0
243
+ tensorboard=2.16.2=pypi_0
244
+ tensorboard-data-server=0.7.2=pypi_0
245
+ thinc=8.2.5=pypi_0
246
+ threadpoolctl=3.5.0=pypi_0
247
+ tiktoken=0.7.0=pypi_0
248
+ tinycss2=1.3.0=pypi_0
249
+ tk=8.6.14=h39e8969_0
250
+ tokenizers=0.13.3=pypi_0
251
+ tomlkit=0.12.0=pypi_0
252
+ toolz=0.12.1=pypi_0
253
+ torch=2.3.1=pypi_0
254
+ torchaudio=2.3.1+cpu=pypi_0
255
+ tornado=6.4.1=pypi_0
256
+ tqdm=4.66.4=pypi_0
257
+ traitlets=5.14.3=pypi_0
258
+ transformers=4.27.4=pypi_0
259
+ translators=5.9.2=pypi_0
260
+ triton=2.3.1=pypi_0
261
+ txtsplit=1.0.0=pypi_0
262
+ typer=0.12.3=pypi_0
263
+ typing-extensions=4.12.2=pypi_0
264
+ tzdata=2024.1=pypi_0
265
+ tzlocal=5.2=pypi_0
266
+ ujson=5.10.0=pypi_0
267
+ unidecode=1.3.7=pypi_0
268
+ unidic=1.1.0=pypi_0
269
+ unidic-lite=1.0.8=pypi_0
270
+ urllib3=2.2.2=pypi_0
271
+ uvicorn=0.30.1=pypi_0
272
+ uvloop=0.19.0=pypi_0
273
+ wasabi=0.10.1=pypi_0
274
+ watchfiles=0.22.0=pypi_0
275
+ wavmark=0.0.3=pypi_0
276
+ wcwidth=0.2.13=pypi_0
277
+ weasel=0.4.1=pypi_0
278
+ webencodings=0.5.1=pypi_0
279
+ websockets=11.0.3=pypi_0
280
+ werkzeug=3.0.3=pypi_0
281
+ wheel=0.43.0=py310h06a4308_0
282
+ whisper-timestamped=1.14.2=pypi_0
283
+ wrapt=1.16.0=pypi_0
284
+ xz=5.4.6=h5eee18b_1
285
+ yarg=0.1.9=pypi_0
286
+ zlib=1.2.13=h5eee18b_1
setup.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from setuptools import setup, find_packages


# Read the long description with an explicit encoding and close the file
# deterministically, instead of relying on the locale default encoding and on
# garbage collection to release the handle.
with open('README.md', encoding='utf-8') as readme_file:
    long_description = readme_file.read().strip()


setup(name='MyShell-OpenVoice',
      version='0.0.0',
      description='Instant voice cloning by MyShell.',
      long_description=long_description,
      long_description_content_type='text/markdown',
      keywords=[
          'text-to-speech',
          'tts',
          'voice-clone',
          'zero-shot-tts'
      ],
      url='https://github.com/myshell-ai/OpenVoice',
      project_urls={
          'Documentation': 'https://github.com/myshell-ai/OpenVoice/blob/main/docs/USAGE.md',
          'Changes': 'https://github.com/myshell-ai/OpenVoice/releases',
          'Code': 'https://github.com/myshell-ai/OpenVoice',
          'Issue tracker': 'https://github.com/myshell-ai/OpenVoice/issues',
      },
      author='MyShell',
      # NOTE(review): this value looks like a redaction placeholder rather
      # than a real address - confirm the intended contact e-mail.
      author_email='[email protected]',
      license='MIT License',
      packages=find_packages(),

      python_requires='>=3.9',
      # NOTE(review): the environment listing shipped with this commit shows
      # gradio 4.38.1 while 3.48.0 is pinned here - confirm which is intended.
      install_requires=[
          'librosa==0.9.1',
          'faster-whisper==0.9.0',
          'pydub==0.25.1',
          'wavmark==0.0.3',
          'numpy==1.22.0',
          'eng_to_ipa==0.0.2',
          'inflect==7.0.0',
          'unidecode==1.3.7',
          'whisper-timestamped==1.14.2',
          'pypinyin==0.50.0',
          'cn2an==0.5.22',
          'jieba==0.42.1',
          'gradio==3.48.0',
          'langid==1.1.6'
      ],
      zip_safe=False
      )
videosource.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ from openvoice import se_extractor
4
+ from openvoice.api import ToneColorConverter
5
+ import whisper
6
+ from moviepy.editor import VideoFileClip
7
+ from pydub import AudioSegment
8
+ from df.enhance import enhance, init_df, load_audio, save_audio
9
+ import translators as ts
10
+ from melo.api import TTS
11
+ from concurrent.futures import ThreadPoolExecutor
12
+ import ffmpeg
13
+
14
# Paths, device and tunables used by the rest of the script
ckpt_converter = 'checkpoints_v2/converter'
output_dir = 'outputs_v2'
device = "cuda:0" if torch.cuda.is_available() else "cpu"
speed = 1.0  # speech rate passed to the TTS model; adjustable
os.makedirs(output_dir, exist_ok=True)
src_path = os.path.join(output_dir, "tmp.wav")

# Load the tone color converter from its config and checkpoint
tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

# Extract the audio track of the reference video and resample it to 48 kHz in place
reference_video = VideoFileClip("resources/example1.mp4")
reference_audio = os.path.join(output_dir, "reference_audio.wav")
reference_video.audio.write_audiofile(reference_audio)
AudioSegment.from_file(reference_audio).set_frame_rate(48000).export(reference_audio, format="wav")

# Enhance the audio and overwrite the reference file with the enhanced signal
model, df_state, _ = init_df()
audio, _ = load_audio(reference_audio, sr=df_state.sr())
enhanced = enhance(model, df_state, audio)
save_audio(reference_audio, enhanced, df_state.sr())

# The enhanced reference audio is the voice to clone
reference_speaker = reference_audio
target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, vad=False)

# Transcribe the reference audio; the result carries the text, the detected
# language and per-segment start/end times
sttmodel = whisper.load_model("base")
sttresult = sttmodel.transcribe(reference_speaker, verbose=True)

# Print the original transcription
print(sttresult["text"])
print(sttresult["language"])

# Segments with start and end times, consumed by the translation step
segments = sttresult["segments"]
54
+
55
# Choose the target language for translation.
# Maps the language code typed by the user (also passed to the translator as
# ``choice``) to the language identifier expected by the TTS model.
LANGUAGE_CODES = {
    'en': 'EN_NEWEST',
    'es': 'ES',
    'fr': 'FR',
    'zh-CN': 'ZH',
    'ja': 'JP',
    'ko': 'KR',
}

# Keep asking until a supported code is entered. Previously an unknown code
# silently selected English for TTS while the unknown code itself was still
# handed to the translator, so the two steps could disagree.
language = None
while language is None:
    choice = input("Choose language to translate to: ").strip()
    language = LANGUAGE_CODES.get(choice)
    if language is None:
        print(f"Unsupported language '{choice}'. Choose one of: {', '.join(LANGUAGE_CODES)}")
76
+
77
# Translate the transcription segment by segment
def translate_segment(segment):
    """Translate one transcription segment into the chosen language.

    Args:
        segment: Mapping with ``"start"``, ``"end"`` and ``"text"`` entries.

    Returns:
        tuple: ``(start, end, translated_text)``.
    """
    start, end = segment["start"], segment["end"]
    translated_text = ts.translate_text(
        query_text=segment["text"],
        translator="google",
        to_language=choice,
    )
    return start, end, translated_text
80
+
81
# Translate in small batches: each batch gets its own thread pool, and the
# next batch only starts once the current one has finished
batch_size = 2
translation_segments = []
for batch_start in range(0, len(segments), batch_size):
    batch_end = batch_start + batch_size
    with ThreadPoolExecutor(max_workers=5) as pool:
        translated_batch = list(pool.map(translate_segment, segments[batch_start:batch_end]))
    translation_segments.extend(translated_batch)
89
+
90
# Text-to-speech model for the target language and its speaker table
model = TTS(language=language, device=device)
speaker_ids = model.hps.data.spk2id


def generate_segment_audio(segment, speaker_id):
    """Synthesize speech for one translated segment and write it to a WAV file.

    Args:
        segment: ``(start, end, translated_text)`` tuple.
        speaker_id: Speaker identifier passed to the TTS model.

    Returns:
        tuple: ``(segment_path, start, end, translated_text)`` where
        ``segment_path`` is the file written inside ``output_dir``.
    """
    start, end, translated_text = segment
    wav_name = f'segment_{start}_{end}.wav'
    segment_path = os.path.join(output_dir, wav_name)
    model.tts_to_file(translated_text, speaker_id, segment_path, speed=speed)
    return segment_path, start, end, translated_text
99
+
100
def _format_srt_timestamp(seconds):
    """Format a time offset in seconds as an SRT ``HH:MM:SS,mmm`` timestamp."""
    hours, remainder = divmod(int(seconds), 3600)
    minutes, whole_seconds = divmod(remainder, 60)
    milliseconds = int((seconds * 1000) % 1000)
    return f"{hours:02}:{minutes:02}:{whole_seconds:02},{milliseconds:03}"


def _report_ffmpeg_error(error):
    """Print an ffmpeg failure, including captured stderr when there is any."""
    print('ffmpeg error:', error)
    # run() is called without capture_stderr, so stderr is None here and
    # decoding it unconditionally would raise AttributeError inside the handler.
    if error.stderr:
        print(error.stderr.decode('utf-8', errors='replace'))


for speaker_key in speaker_ids.keys():
    speaker_id = speaker_ids[speaker_key]
    # Base speaker embedding files are named in lower case with dashes.
    speaker_key = speaker_key.lower().replace('_', '-')

    source_se = torch.load(f'checkpoints_v2/base_speakers/ses/{speaker_key}.pth', map_location=device)

    # Synthesize every translated segment, then convert it to the target voice.
    segment_files = []
    subtitle_entries = []
    for segment in translation_segments:
        segment_file, start, end, translated_text = generate_segment_audio(segment, speaker_id)

        # Run the tone color converter; the file is converted in place.
        encode_message = "@MyShell"
        tone_color_converter.convert(
            audio_src_path=segment_file,
            src_se=source_se,
            tgt_se=target_se,
            output_path=segment_file,
            message=encode_message)

        segment_files.append((segment_file, start, end, translated_text))

    # Combine the audio segments and pair each one with its video slice.
    combined_audio = AudioSegment.empty()
    video_segments = []
    previous_end = 0
    for subtitle_index, (segment_file, start, end, translated_text) in enumerate(segment_files, start=1):
        segment_audio = AudioSegment.from_file(segment_file)
        combined_audio += segment_audio

        # Duration of the synthesized audio in seconds (pydub lengths are in ms).
        audio_duration = len(segment_audio) / 1000.0

        # Subtitles follow the timeline of the generated audio, not the source.
        subtitle_entries.append((subtitle_index, previous_end, previous_end + audio_duration, translated_text))

        # Take the matching slice of the source video and rescale its
        # presentation timestamps so it lasts as long as the new audio.
        video_segment = (
            ffmpeg
            .input(reference_video.filename, ss=start, to=end)
            .filter('setpts', f'PTS / {(end - start) / audio_duration}')
        )
        video_segments.append((video_segment, ffmpeg.input(segment_file)))
        previous_end += audio_duration

    save_path = os.path.join(output_dir, f'output_v2_{speaker_key}.wav')
    combined_audio.export(save_path, format="wav")

    # Combine video and audio segments using ffmpeg; concat expects the
    # streams interleaved as video, audio, video, audio, ...
    video_and_audio_files = [item for sublist in video_segments for item in sublist]
    joined = (
        ffmpeg
        .concat(*video_and_audio_files, v=1, a=1)
        .node
    )

    final_video_path = os.path.join(output_dir, f'final_video_{speaker_key}.mp4')
    try:
        (
            ffmpeg
            .output(joined[0], joined[1], final_video_path, vcodec='libx264', acodec='aac')
            .run(overwrite_output=True)
        )
    except ffmpeg.Error as e:
        _report_ffmpeg_error(e)

    print(f"Final video without subtitles saved to: {final_video_path}")

    # Generate subtitles file in SRT format
    srt_path = os.path.join(output_dir, 'subtitles.srt')
    with open(srt_path, 'w', encoding='utf-8') as srt_file:
        for index, start, end, text in subtitle_entries:
            srt_file.write(f"{index}\n")
            srt_file.write(f"{_format_srt_timestamp(start)} --> {_format_srt_timestamp(end)}\n")
            srt_file.write(f"{text}\n\n")

    # Add subtitles to the video
    final_video_with_subs_path = os.path.join(output_dir, f'final_video_with_subs_{speaker_key}.mp4')
    try:
        (
            ffmpeg
            .input(final_video_path)
            .output(final_video_with_subs_path, vf=f"subtitles={srt_path}")
            .run(overwrite_output=True)
        )
    except ffmpeg.Error as e:
        _report_ffmpeg_error(e)

    print(f"Final video with subtitles saved to: {final_video_with_subs_path}")
videotranslator.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import torch
4
+ from openvoice import se_extractor
5
+ from openvoice.api import ToneColorConverter
6
+ import whisper
7
+ from moviepy.editor import VideoFileClip
8
+ from pydub import AudioSegment
9
+ from df.enhance import enhance, init_df, load_audio, save_audio
10
+ import translators as ts
11
+ from melo.api import TTS
12
+ from concurrent.futures import ThreadPoolExecutor
13
+ import ffmpeg
14
+
15
def _srt_timestamp(seconds):
    """Format a time offset in seconds as an SRT ``HH:MM:SS,mmm`` timestamp."""
    # Convert to whole milliseconds first so binary floating-point error
    # cannot truncate a value sitting a hair below the next millisecond.
    total_ms = round(seconds * 1000)
    total_seconds, millis = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"


def _report_ffmpeg_error(error):
    """Print an ``ffmpeg.Error`` plus ffmpeg's stderr when it was captured."""
    print('ffmpeg error:', error)
    # ``stderr`` is only populated when run() is called with
    # capture_stderr=True; otherwise it is None and decoding it
    # unconditionally would raise AttributeError inside the handler.
    if error.stderr:
        print(error.stderr.decode('utf-8', errors='replace'))


def process_video(video_file, language_choice):
    """Translate a video's speech and re-voice it with the original speaker's tone.

    Steps: extract and denoise the audio track, transcribe it with Whisper,
    translate every transcribed segment with Google (via ``translators``),
    synthesize each translated segment with MeloTTS, convert it to the
    reference speaker's tone color, retime the matching video slice to the
    synthesized audio length, concatenate everything and burn in subtitles.

    All intermediate and final files are written to ``outputs_v2``.

    Args:
        video_file: Path of the input video, or None.
        language_choice: Target language code passed to the translator
            (``'en'``, ``'es'``, ``'fr'``, ``'zh'``, ``'ja'`` and ``'ko'`` have
            a matching TTS voice; anything else falls back to the English
            voice).

    Returns:
        Path of the subtitled video rendered for the last speaker that was
        processed successfully, or None when an argument is None, the video
        has no audio track, nothing was transcribed, or rendering failed.
    """
    if video_file is None or language_choice is None:
        return None

    # Initialize paths and devices
    ckpt_converter = 'checkpoints_v2/converter'
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    output_dir = 'outputs_v2'
    os.makedirs(output_dir, exist_ok=True)

    tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
    tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

    # Extract the reference audio, then release the clip: only its file name
    # is needed afterwards, and an open clip keeps its reader alive.
    reference_audio = os.path.join(output_dir, "reference_audio.wav")
    reference_video = VideoFileClip(video_file)
    try:
        source_video_path = reference_video.filename
        if reference_video.audio is None:
            print("Input video has no audio track; nothing to translate.")
            return None
        reference_video.audio.write_audiofile(reference_audio)
    finally:
        reference_video.close()

    resampled_audio = AudioSegment.from_file(reference_audio).set_frame_rate(48000)
    resampled_audio.export(reference_audio, format="wav")

    # Enhance the audio
    df_model, df_state, _ = init_df()
    noisy_audio, _ = load_audio(reference_audio, sr=df_state.sr())
    enhanced = enhance(df_model, df_state, noisy_audio)
    save_audio(reference_audio, enhanced, df_state.sr())
    reference_speaker = reference_audio  # This is the voice you want to clone
    target_se, _ = se_extractor.get_se(reference_speaker, tone_color_converter, vad=False)

    # Speed is adjustable
    speed = 1.0

    # Transcribe the original audio with timestamps
    sttmodel = whisper.load_model("base")
    sttresult = sttmodel.transcribe(reference_speaker, verbose=True)

    # Print the original transcription
    print(sttresult["text"])
    print(sttresult["language"])

    # Get the segments with start and end times
    segments = sttresult['segments']
    if not segments:
        print("No speech segments were transcribed; nothing to translate.")
        return None

    # Map the translator's language code to the TTS voice; unknown codes use
    # the English voice.
    tts_languages = {
        'en': 'EN_NEWEST',
        'es': 'ES',
        'fr': 'FR',
        'zh': 'ZH',
        'ja': 'JP',
        'ko': 'KR',
    }
    language = tts_languages.get(language_choice, 'EN_NEWEST')

    # Translate the transcription segment by segment
    def translate_segment(segment):
        translated = ts.translate_text(
            query_text=segment["text"], translator="google", to_language=language_choice)
        return segment["start"], segment["end"], translated

    # Keep at most two translation requests in flight. One pool is shared by
    # all segments instead of being rebuilt for every pair.
    max_in_flight = 2
    with ThreadPoolExecutor(max_workers=max_in_flight) as executor:
        translation_segments = list(executor.map(translate_segment, segments))

    # Generate the translated audio for each segment
    tts_model = TTS(language=language, device=device)
    speaker_ids = tts_model.hps.data.spk2id

    def generate_segment_audio(segment, speaker_id):
        start, end, translated_text = segment
        segment_path = os.path.join(output_dir, f'segment_{start}_{end}.wav')
        tts_model.tts_to_file(translated_text, speaker_id, segment_path, speed=speed)
        return segment_path, start, end, translated_text

    encode_message = "@MyShell"
    # Stays None when no speaker produced a finished video.
    final_video_with_subs_path = None

    # NOTE(review): indentation was reconstructed from the statement order;
    # everything below up to the final return is treated as the per-speaker
    # loop body - confirm against the original file.
    for speaker_key in speaker_ids.keys():
        speaker_id = speaker_ids[speaker_key]
        speaker_key = speaker_key.lower().replace('_', '-')

        source_se = torch.load(f'checkpoints_v2/base_speakers/ses/{speaker_key}.pth', map_location=device)

        segment_files = []
        for segment in translation_segments:
            segment_file, start, end, translated_text = generate_segment_audio(segment, speaker_id)

            # Run the tone color converter (in place on the segment file)
            tone_color_converter.convert(
                audio_src_path=segment_file,
                src_se=source_se,
                tgt_se=target_se,
                output_path=segment_file,
                message=encode_message)

            segment_files.append((segment_file, start, end, translated_text))

        # Combine the audio segments
        combined_audio = AudioSegment.empty()
        video_segments = []
        subtitle_entries = []
        previous_end = 0
        for segment_file, start, end, translated_text in segment_files:
            segment_audio = AudioSegment.from_file(segment_file)

            # Duration of the synthesized audio, in seconds
            audio_duration = len(segment_audio) / 1000.0
            source_duration = end - start
            # A zero-length clip on either side would divide by zero (here or
            # inside the setpts expression) and has nothing to show anyway.
            if audio_duration <= 0 or source_duration <= 0:
                continue

            combined_audio += segment_audio

            # Subtitle timing follows the synthesized audio, not the source
            subtitle_entries.append(
                (len(subtitle_entries) + 1, previous_end, previous_end + audio_duration, translated_text))

            # Get the corresponding video segment and adjust its speed to
            # match the audio duration
            video_segment = (
                ffmpeg
                .input(source_video_path, ss=start, to=end)
                .filter('setpts', f'PTS / {source_duration / audio_duration}')
            )
            video_segments.append((video_segment, ffmpeg.input(segment_file)))
            previous_end += audio_duration

        if not video_segments:
            print(f"No usable segments for speaker {speaker_key}; skipping.")
            continue

        save_path = os.path.join(output_dir, f'output_v2_{speaker_key}.wav')
        combined_audio.export(save_path, format="wav")

        # Combine video and audio segments using ffmpeg
        video_and_audio_files = [stream for pair in video_segments for stream in pair]
        joined = ffmpeg.concat(*video_and_audio_files, v=1, a=1).node

        final_video_path = os.path.join(output_dir, f'final_video_{speaker_key}.mp4')
        try:
            (
                ffmpeg
                .output(joined[0], joined[1], final_video_path, vcodec='libx264', acodec='aac')
                .run(overwrite_output=True)
            )
        except ffmpeg.Error as e:
            _report_ffmpeg_error(e)
            # Without this render there is nothing valid to burn subtitles
            # onto (a file of that name could be left over from an old run).
            continue

        print(f"Final video without subtitles saved to: {final_video_path}")

        # Generate subtitles file in SRT format
        srt_path = os.path.join(output_dir, 'subtitles.srt')
        with open(srt_path, 'w', encoding='utf-8') as srt_file:
            for index, start, end, text in subtitle_entries:
                srt_file.write(f"{index}\n")
                srt_file.write(f"{_srt_timestamp(start)} --> {_srt_timestamp(end)}\n")
                srt_file.write(f"{text}\n\n")

        # Add subtitles to the video. Backslashes are escape characters in an
        # ffmpeg filter argument, so hand the filter a forward-slash path.
        srt_filter_path = srt_path.replace('\\', '/')
        subs_output_path = os.path.join(output_dir, f'final_video_with_subs_{speaker_key}.mp4')
        try:
            (
                ffmpeg
                .input(final_video_path)
                .output(subs_output_path, vf=f"subtitles={srt_filter_path}")
                .run(overwrite_output=True)
            )
        except ffmpeg.Error as e:
            _report_ffmpeg_error(e)
            continue

        print(f"Final video with subtitles saved to: {subs_output_path}")
        final_video_with_subs_path = subs_output_path

    return final_video_with_subs_path
206
+
207
+
208
+ # Define Gradio interface
209
def gradio_interface(video_file, language_choice):
    """Gradio callback: hand the uploaded video and chosen language to process_video."""
    translated_video = process_video(video_file, language_choice)
    return translated_video
211
+
212
# Entries offered in the language dropdown.
# NOTE(review): presumably the target languages Google supports when
# translating from English - confirm against the `translators` docs.
language_choices = ts.get_languages("google")["en"]

# Build the UI components first, then wire them into the interface.
interface_inputs = [
    gr.Video(label="Upload Video"),
    gr.Dropdown(choices=language_choices, label="Choose Language for Translation"),
]
interface_output = gr.Video(label="Translated Video")

demo = gr.Interface(
    fn=gradio_interface,
    inputs=interface_inputs,
    outputs=interface_output,
    title="Video Translation and Voice Cloning",
    description="Upload a video, choose a language to translate the audio, and download the processed video with translated audio.",
)
# Starts the web app as soon as this module is executed.
demo.launch()