Mahiruoshi committed on
Commit
0372395
1 Parent(s): 6510319

Upload 158 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .pre-commit-config.yaml +2 -2
  2. Data/BangDreamV22/configs/config.json +197 -0
  3. Data/BangDreamV22/models/G_51000.pth +3 -0
  4. app.py +175 -272
  5. bert/bert_models.json +2 -2
  6. bert/deberta-v2-large-japanese-char-wwm/.gitattributes +34 -0
  7. bert/deberta-v2-large-japanese-char-wwm/README.md +89 -0
  8. bert/deberta-v2-large-japanese-char-wwm/config.json +37 -0
  9. bert/deberta-v2-large-japanese-char-wwm/pytorch_model.bin +3 -0
  10. bert/deberta-v2-large-japanese-char-wwm/special_tokens_map.json +7 -0
  11. bert/deberta-v2-large-japanese-char-wwm/tokenizer_config.json +19 -0
  12. bert/deberta-v2-large-japanese-char-wwm/vocab.txt +0 -0
  13. bert_gen.py +8 -7
  14. clap_gen.py +64 -0
  15. clap_wrapper.py +49 -0
  16. commons.py +6 -14
  17. compress_model.py +89 -0
  18. config.py +13 -2
  19. config.yml +35 -18
  20. configs/config.json +865 -99
  21. css/custom.css +18 -0
  22. data_utils.py +29 -9
  23. default_config.yml +35 -18
  24. emotional/clap-htsat-fused/.gitattributes +34 -0
  25. emotional/clap-htsat-fused/README.md +107 -0
  26. emotional/clap-htsat-fused/config.json +207 -0
  27. emotional/clap-htsat-fused/merges.txt +0 -0
  28. emotional/clap-htsat-fused/preprocessor_config.json +22 -0
  29. emotional/clap-htsat-fused/pytorch_model.bin +3 -0
  30. emotional/clap-htsat-fused/special_tokens_map.json +15 -0
  31. emotional/clap-htsat-fused/tokenizer.json +0 -0
  32. emotional/clap-htsat-fused/tokenizer_config.json +16 -0
  33. emotional/clap-htsat-fused/vocab.json +0 -0
  34. empty_emo.npy +3 -0
  35. export_onnx.py +4 -48
  36. filelists/sample.list +3 -0
  37. img/yuyu.png +0 -0
  38. img/参数说明.png +0 -0
  39. img/宵宫.png +0 -0
  40. img/微信图片_20231010105112.png +0 -0
  41. img/神里绫华.png +0 -0
  42. img/纳西妲.png +0 -0
  43. infer.py +186 -12
  44. models.py +77 -6
  45. monotonic_align/__pycache__/__init__.cpython-311.pyc +0 -0
  46. monotonic_align/__pycache__/core.cpython-311.pyc +0 -0
  47. onnx_modules/V200/__init__.py +0 -0
  48. onnx_modules/V200/attentions_onnx.py +378 -0
  49. onnx_modules/V200/models_onnx.py +990 -0
  50. onnx_modules/V200/text/__init__.py +1 -0
.pre-commit-config.yaml CHANGED
@@ -7,13 +7,13 @@ repos:
7
  - id: trailing-whitespace
8
 
9
  - repo: https://github.com/astral-sh/ruff-pre-commit
10
- rev: v0.1.4
11
  hooks:
12
  - id: ruff
13
  args: [ --fix ]
14
 
15
  - repo: https://github.com/psf/black
16
- rev: 23.10.1
17
  hooks:
18
  - id: black
19
 
 
7
  - id: trailing-whitespace
8
 
9
  - repo: https://github.com/astral-sh/ruff-pre-commit
10
+ rev: v0.1.7
11
  hooks:
12
  - id: ruff
13
  args: [ --fix ]
14
 
15
  - repo: https://github.com/psf/black
16
+ rev: 23.11.0
17
  hooks:
18
  - id: black
19
 
Data/BangDreamV22/configs/config.json ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 3000,
5
+ "seed": 42,
6
+ "epochs": 1000,
7
+ "learning_rate": 0.0002,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 10,
14
+ "fp16_run": false,
15
+ "lr_decay": 0.99995,
16
+ "segment_size": 16384,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0,
21
+ "skip_optimizer": true,
22
+ "freeze_ZH_bert": false,
23
+ "freeze_JP_bert": false,
24
+ "freeze_EN_bert": false
25
+ },
26
+ "data": {
27
+ "training_files": "Data/BangDream/filelists/train.list",
28
+ "validation_files": "Data/BangDream/filelists/val.list",
29
+ "max_wav_value": 32768.0,
30
+ "sampling_rate": 44100,
31
+ "filter_length": 2048,
32
+ "hop_length": 512,
33
+ "win_length": 2048,
34
+ "n_mel_channels": 128,
35
+ "mel_fmin": 0.0,
36
+ "mel_fmax": null,
37
+ "add_blank": true,
38
+ "n_speakers": 99,
39
+ "cleaned_text": true,
40
+ "spk2id": {
41
+ "香澄": 0,
42
+ "有咲": 1,
43
+ "沙綾": 2,
44
+ "りみ": 3,
45
+ "たえ": 4,
46
+ "沙綾、りみ、たえ": 5,
47
+ "三月七1": 6,
48
+ "紗夜": 7,
49
+ "ロック": 8,
50
+ "パレオ": 9,
51
+ "レイヤ": 10,
52
+ "チュチュ": 11,
53
+ "彩": 12,
54
+ "千聖": 13,
55
+ "イヴ": 14,
56
+ "日菜": 15,
57
+ "麻弥": 16,
58
+ "蘭": 17,
59
+ "モカ": 18,
60
+ "巴": 19,
61
+ "ひまり": 20,
62
+ "つぐみ": 21,
63
+ "はぐみ": 22,
64
+ "花音": 23,
65
+ "美咲": 24,
66
+ "薫": 25,
67
+ "こころ": 26,
68
+ "つくし": 27,
69
+ "七深": 28,
70
+ "透子": 29,
71
+ "ましろ": 30,
72
+ "瑠唯": 31,
73
+ "友希那": 32,
74
+ "あこ": 33,
75
+ "リサ": 34,
76
+ "燐子": 35,
77
+ "燈": 36,
78
+ "愛音": 37,
79
+ "楽奈": 38,
80
+ "そよ": 39,
81
+ "立希": 40,
82
+ "ますき": 41,
83
+ "祥子": 42,
84
+ "睦": 43,
85
+ "海鈴": 44,
86
+ "にゃむ": 45,
87
+ "初華": 46,
88
+ "華戀": 47,
89
+ "晶": 48,
90
+ "光": 49,
91
+ "未知留": 50,
92
+ "香子": 51,
93
+ "雙葉": 52,
94
+ "真晝": 53,
95
+ "艾露": 54,
96
+ "珠緒": 55,
97
+ "艾露露": 56,
98
+ "純那": 57,
99
+ "克洛迪娜": 58,
100
+ "真矢": 59,
101
+ "奈奈": 60,
102
+ "壘": 61,
103
+ "文": 62,
104
+ "一愛": 63,
105
+ "菈樂菲": 64,
106
+ "司": 65,
107
+ "美空": 66,
108
+ "靜羽": 67,
109
+ "悠悠子": 68,
110
+ "八千代": 69,
111
+ "栞": 70,
112
+ "美帆": 71,
113
+ "芙蘿菈": 72,
114
+ "克蕾兒": 73,
115
+ "安德露": 74,
116
+ "瑪莉亞貝菈": 75,
117
+ "克拉迪亞": 76,
118
+ "桃樂西": 77,
119
+ "瑪麗安": 78,
120
+ "八重神子1": 79,
121
+ "娜塔莎": 80,
122
+ "宵宫": 81,
123
+ "派蒙11": 82,
124
+ "派蒙13": 83,
125
+ "派蒙3": 84,
126
+ "派蒙7": 85,
127
+ "派蒙8": 86,
128
+ "派蒙9": 87,
129
+ "派蒙10": 88,
130
+ "派蒙6": 89,
131
+ "派蒙4": 90,
132
+ "派蒙1": 91,
133
+ "派蒙2": 92,
134
+ "派蒙15": 93,
135
+ "派蒙16": 94,
136
+ "派蒙14": 95,
137
+ "派蒙12": 96,
138
+ "派蒙5": 97,
139
+ "纳西妲1": 98
140
+ }
141
+ },
142
+ "model": {
143
+ "use_spk_conditioned_encoder": true,
144
+ "use_noise_scaled_mas": true,
145
+ "use_mel_posterior_encoder": false,
146
+ "use_duration_discriminator": true,
147
+ "inter_channels": 192,
148
+ "hidden_channels": 192,
149
+ "filter_channels": 768,
150
+ "n_heads": 2,
151
+ "n_layers": 6,
152
+ "kernel_size": 3,
153
+ "p_dropout": 0.1,
154
+ "resblock": "1",
155
+ "resblock_kernel_sizes": [
156
+ 3,
157
+ 7,
158
+ 11
159
+ ],
160
+ "resblock_dilation_sizes": [
161
+ [
162
+ 1,
163
+ 3,
164
+ 5
165
+ ],
166
+ [
167
+ 1,
168
+ 3,
169
+ 5
170
+ ],
171
+ [
172
+ 1,
173
+ 3,
174
+ 5
175
+ ]
176
+ ],
177
+ "upsample_rates": [
178
+ 8,
179
+ 8,
180
+ 2,
181
+ 2,
182
+ 2
183
+ ],
184
+ "upsample_initial_channel": 512,
185
+ "upsample_kernel_sizes": [
186
+ 16,
187
+ 16,
188
+ 8,
189
+ 2,
190
+ 2
191
+ ],
192
+ "n_layers_q": 3,
193
+ "use_spectral_norm": false,
194
+ "gin_channels": 256
195
+ },
196
+ "version": "2.2"
197
+ }
Data/BangDreamV22/models/G_51000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:521be4508c8b8b81e81201372cce0ac09cef35ca0f66b3d981f1689a601db3c5
3
+ size 750066550
app.py CHANGED
@@ -1,7 +1,8 @@
1
- # flake8: noqa: E402
2
  import os
3
- import logging
4
 
 
5
  import re_matching
6
 
7
  logging.getLogger("numba").setLevel(logging.WARNING)
@@ -15,34 +16,45 @@ logging.basicConfig(
15
 
16
  logger = logging.getLogger(__name__)
17
 
18
- import warnings
19
- warnings.filterwarnings("ignore", category=UserWarning, module="gradio.blocks")
 
 
 
 
 
 
20
 
21
- import shutil
22
 
23
- from datetime import datetime
24
- import re
25
- import torch
26
- import utils
27
- from infer import infer, latest_version, get_net_g
28
  import gradio as gr
29
- import numpy as np
30
- from tools.sentence import extrac, is_japanese, is_chinese, seconds_to_ass_time, extract_text_from_file, remove_annotations
31
- import sys
32
- import math
33
 
34
- from scipy.io.wavfile import write
 
35
 
36
- from tools.translate import translate
 
 
 
 
37
 
38
- import random
 
 
39
 
40
  net_g = None
41
-
42
- cara_list = ["ひまり","たえ","彩","日菜","美咲","ましろ","燐子","香子","珠緒","たえ"]
43
-
 
 
 
 
 
 
 
 
 
44
  BandList = {
45
-
46
  "PoppinParty":["香澄","有咲","たえ","りみ","沙綾"],
47
  "Afterglow":["蘭","モカ","ひまり","巴","つぐみ"],
48
  "HelloHappyWorld":["こころ","美咲","薫","花音","はぐみ"],
@@ -50,172 +62,123 @@ BandList = {
50
  "Roselia":["友希那","紗夜","リサ","燐子","あこ"],
51
  "RaiseASuilen":["レイヤ","ロック","ますき","チュチュ","パレオ"],
52
  "Morfonica":["ましろ","瑠唯","つくし","七深","透子"],
53
- "MyGo&AveMujica(Part)":["燈","愛音","そよ","立希","楽奈","祥子","睦","海鈴"],
 
54
  "圣翔音乐学园":["華戀","光","香子","雙葉","真晝","純那","克洛迪娜","真矢","奈奈"],
55
  "凛明馆女子学校":["珠緒","壘","文","悠悠子","一愛"],
56
  "弗隆提亚艺术学校":["艾露","艾露露","菈樂菲","司","靜羽"],
57
  "西克菲尔特音乐学院":["晶","未知留","八千代","栞","美帆"]
58
  }
59
 
60
- device = (
61
- "cuda:0"
62
- if torch.cuda.is_available()
63
- else (
64
- "mps"
65
- if sys.platform == "darwin" and torch.backends.mps.is_available()
66
- else "cpu"
67
- )
68
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
- def generate_audio(
71
  text,
72
  sdp_ratio,
73
  noise_scale,
74
  noise_scale_w,
75
  length_scale,
76
- speaker,
77
- language,
 
78
  ):
79
- if len(text)>100:
80
- return
81
- with torch.no_grad():
82
- if language == 'Auto':
83
- language = "EN"
84
- if is_japanese(text):
85
- language = "JP"
86
- elif is_chinese(text):
87
- language = "ZH"
88
- current_time = datetime.now()
89
- print(str(current_time)+':'+str(speaker)+":"+language)
90
- audio = infer(
91
- text,
92
- sdp_ratio=sdp_ratio,
93
- noise_scale=noise_scale,
94
- noise_scale_w=noise_scale_w,
95
- length_scale=length_scale,
96
- sid=speaker,
97
- language=language,
98
- hps=hps,
99
- net_g=net_g,
100
- device=device,
101
- )
102
- return gr.processing_utils.convert_to_16_bit_wav(audio)
103
 
104
- def tts_fn(
105
- text: str,
106
- speaker,
107
- sdp_ratio,
108
- noise_scale,
109
- noise_scale_w,
110
- length_scale,
111
- language,
112
- LongSentence,
113
- ):
114
- if not LongSentence:
115
- with torch.no_grad():
116
- audio = generate_audio(
117
- text,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  sdp_ratio=sdp_ratio,
119
  noise_scale=noise_scale,
120
  noise_scale_w=noise_scale_w,
121
  length_scale=length_scale,
122
- speaker=speaker,
123
- language= language,
124
- )
 
 
 
 
125
  torch.cuda.empty_cache()
126
- return (hps.data.sampling_rate, audio)
127
- else:
128
-
129
- final_list = extrac(text)
130
- audio_fin = []
131
- for sentence in final_list:
132
- if len(sentence) > 1:
133
- with torch.no_grad():
134
- audio = generate_audio(
135
- sentence,
136
- sdp_ratio=sdp_ratio,
137
- noise_scale=noise_scale,
138
- noise_scale_w=noise_scale_w,
139
- length_scale=length_scale,
140
- speaker=speaker,
141
- language= language,
142
- )
143
- silence_frames = int(math.log(len(sentence)+1, 1000) * 44010) if is_chinese(sentence) else int(math.log(len(sentence)+1, 3000) * 44010)
144
- silence_data = np.zeros((silence_frames,), dtype=audio.dtype)
145
- audio_fin.append(audio)
146
- audio_fin.append(silence_data)
147
- return (hps.data.sampling_rate, np.concatenate(audio_fin))
148
-
149
- def generate_audio_and_srt_for_group(group, outputPath, group_index, sampling_rate, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime):
150
- audio_fin = []
151
- ass_entries = []
152
- start_time = 0
153
- speaker = random.choice(cara_list)
154
- ass_header = """[Script Info]
155
- ; 我没意见
156
- Title: Audiobook
157
- ScriptType: v4.00+
158
- WrapStyle: 0
159
- PlayResX: 640
160
- PlayResY: 360
161
- ScaledBorderAndShadow: yes
162
- [V4+ Styles]
163
- Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
164
- Style: Default,Arial,20,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,1,1,2,10,10,10,1
165
- [Events]
166
- Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
167
- """
168
-
169
- for sentence in group:
170
- try:
171
- FakeSpeaker = sentence.split("|")[0]
172
- print(FakeSpeaker)
173
- SpeakersList = re.split('\n', spealerList)
174
- if FakeSpeaker in list(hps.data.spk2id.keys()):
175
- speaker = FakeSpeaker
176
- for i in SpeakersList:
177
- if FakeSpeaker == i.split("|")[1]:
178
- speaker = i.split("|")[0]
179
- if sentence != '\n':
180
- audio = generate_audio(remove_annotations(sentence.split("|")[-1]).replace(" ",""), speaker=speaker, sdp_ratio=sdp_ratio, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=length_scale, language='Auto')
181
- silence_frames = int(silenceTime * 44010)
182
- silence_data = np.zeros((silence_frames,), dtype=audio.dtype)
183
- audio_fin.append(audio)
184
- audio_fin.append(silence_data)
185
-
186
- duration = len(audio) / sampling_rate
187
- end_time = start_time + duration + silenceTime
188
- ass_entries.append("Dialogue: 0,{},{},".format(seconds_to_ass_time(start_time), seconds_to_ass_time(end_time)) + "Default,,0,0,0,,{}".format(sentence.replace("|",":")))
189
- start_time = end_time
190
- except:
191
- pass
192
- wav_filename = os.path.join(outputPath, f'audiobook_part_{group_index}.wav')
193
- ass_filename = os.path.join(outputPath, f'audiobook_part_{group_index}.ass')
194
-
195
- write(wav_filename, sampling_rate, np.concatenate(audio_fin))
196
-
197
- with open(ass_filename, 'w', encoding='utf-8') as f:
198
- f.write(ass_header + '\n'.join(ass_entries))
199
- return (hps.data.sampling_rate, np.concatenate(audio_fin))
200
-
201
- def audiobook(inputFile, groupsize, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime,filepath):
202
- directory_path = filepath if torch.cuda.is_available() else "books"
203
-
204
- if os.path.exists(directory_path):
205
- shutil.rmtree(directory_path)
206
 
207
- os.makedirs(directory_path)
208
- text = extract_text_from_file(inputFile.name)
209
- sentences = extrac(text)
210
- GROUP_SIZE = groupsize
211
- for i in range(0, len(sentences), GROUP_SIZE):
212
- group = sentences[i:i+GROUP_SIZE]
213
- if spealerList == "":
214
- spealerList = "无"
215
- result = generate_audio_and_srt_for_group(group,directory_path, i//GROUP_SIZE + 1, 44100, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime)
216
- if not torch.cuda.is_available():
217
- return result
218
- return result
219
 
220
  def loadmodel(model):
221
  _ = net_g.eval()
@@ -223,50 +186,56 @@ def loadmodel(model):
223
  return "success"
224
 
225
  if __name__ == "__main__":
226
- hps = utils.get_hparams_from_file('Data/BangDream/config.json')
227
- version = hps.version if hasattr(hps, "version") else latest_version
228
- net_g = get_net_g(
229
- model_path='Data/BangDream/models/G_33000.pth', version=version, device=device, hps=hps
230
- )
231
- speaker_ids = hps.data.spk2id
232
- speakers = list(speaker_ids.keys())
233
  languages = [ "Auto", "ZH", "JP"]
234
  modelPaths = []
235
- for dirpath, dirnames, filenames in os.walk("Data/BangDream/models/"):
236
  for filename in filenames:
237
  modelPaths.append(os.path.join(dirpath, filename))
 
 
 
 
 
 
238
  with gr.Blocks() as app:
239
- gr.Markdown(value="""
240
- 少歌邦邦全员在线语音合成(Bert-Vits2)\n
241
- 新版本[Mygo&AveMujica](https://huggingface.co/spaces/Mahiruoshi/MyGO_VIts-bert)情感嵌入模型已更新,全员模型将于下个Bert-vits2版本更新
242
- 二创请标注作者:B站@Mahiroshi: https://space.bilibili.com/19874615 ,如果有问题需要反馈可私信联系\n
243
- 声音归属:BangDream及少歌手游\n
244
- !!!注意:huggingface容器仅用作展示,建议克隆本项目后本地运行app.py,环境参考requirements.txt\n
245
- Bert-vits2[项目](https://github.com/Stardust-minus/Bert-VITS2)本身仍然处于开发过程中,因此稳定性存在一定问题。
246
- 关于此模型的使用参考: https://nijigaku.top/2023/10/03/BangDreamTTS\n
247
- [数据集制作](https://huggingface.co/spaces/Mahiruoshi/BangDream-Bert-VITS2/tree/main/%E7%88%AC%E8%99%AB),
248
- [服务器启动示例](https://huggingface.co/spaces/Mahiruoshi/BangDream-Bert-VITS2/blob/main/server.py)\n
249
- 使用本模型请严格遵守法律法规!禁止生成任何有损声优或者企划的内容!!!!!\n
250
- このモデルを使用する際は法律法規を厳守してください!声優や企画に損害を与える内容の生成は禁止です!!!!!\n
251
- Please strictly follow the laws in your country and regulations when using this model! It is prohibited to generate any content that is harmful to others!!!!!\n
252
- 发布二创作品请标注本项目作者及链接、作品使用Bert-VITS2 AI生成!\n
253
- """)
254
  for band in BandList:
255
  with gr.TabItem(band):
256
  for name in BandList[band]:
257
  with gr.TabItem(name):
 
 
 
 
258
  with gr.Row():
259
  with gr.Column():
260
  with gr.Row():
261
  gr.Markdown(
262
  '<div align="center">'
263
- f'<img style="width:auto;height:400px;" src="file/image/{name}.png">'
264
  '</div>'
265
  )
266
  length_scale = gr.Slider(
267
  minimum=0.1, maximum=2, value=1, step=0.01, label="语速调节"
268
  )
269
- LongSentence = gr.Checkbox(value=False, label="自动拆分句子")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
  with gr.Accordion(label="切换模型", open=False):
271
  modelstrs = gr.Dropdown(label = "模型", choices = modelPaths, value = modelPaths[0], type = "value")
272
  btnMod = gr.Button("载入模型")
@@ -276,99 +245,33 @@ if __name__ == "__main__":
276
  text = gr.TextArea(
277
  label="输入纯日语或者中文",
278
  placeholder="输入纯日语或者中文",
279
- value="有个人躺在地上,哀嚎......\n有个人睡着了,睡在盒子里。\n我要把它打开,看看他的梦是什么。",
280
- )
 
 
 
 
281
  btn = gr.Button("点击生成", variant="primary")
282
  audio_output = gr.Audio(label="Output Audio")
 
283
  btntran = gr.Button("快速中翻日")
284
  translateResult = gr.TextArea("从这复制翻译后的文本")
285
  btntran.click(translate, inputs=[text], outputs = [translateResult])
286
- with gr.Accordion(label="其它参数设定", open=False):
287
- sdp_ratio = gr.Slider(
288
- minimum=0, maximum=1, value=0.2, step=0.01, label="SDP/DP混合比"
289
- )
290
- noise_scale = gr.Slider(
291
- minimum=0.1, maximum=2, value=0.6, step=0.01, label="感情调节"
292
- )
293
- noise_scale_w = gr.Slider(
294
- minimum=0.1, maximum=2, value=0.8, step=0.01, label="音素长度"
295
- )
296
- language = gr.Dropdown(
297
- choices=languages, value=languages[0], label="选择语言(默认自动)"
298
- )
299
- speaker = gr.Dropdown(
300
- choices=speakers, value=name, label="说话人"
301
- )
302
  btn.click(
303
- tts_fn,
304
  inputs=[
305
  text,
306
- speaker,
307
  sdp_ratio,
308
  noise_scale,
309
  noise_scale_w,
310
  length_scale,
311
- language,
312
- LongSentence,
 
313
  ],
314
  outputs=[audio_output],
315
  )
316
 
317
- with gr.Tab('拓展功能'):
318
- with gr.Row():
319
- with gr.Column():
320
- gr.Markdown(
321
- f"从 <a href='https://nijigaku.top/2023/10/03/BangDreamTTS/'>我的博客站点</a> 查看自制galgame使用说明\n</a>"
322
- )
323
- inputFile = gr.UploadButton(label="上传txt(可设置角色对应表)、epub或mobi文件")
324
- groupSize = gr.Slider(
325
- minimum=10, maximum=1000 if torch.cuda.is_available() else 50,value = 50, step=1, label="单个音频文件包含的最大字数"
326
- )
327
- silenceTime = gr.Slider(
328
- minimum=0, maximum=1, value=0.5, step=0.1, label="句子的间隔"
329
- )
330
- filepath = gr.TextArea(
331
- label="本地合成时的音频存储文件夹(会清空文件夹警告)",
332
- value = "D:/audiobook/book1",
333
- )
334
- spealerList = gr.TextArea(
335
- label="角色对应表(example)",
336
- placeholder="左边是你想要在每一句话合成中用到的speaker(见角色清单)右边是你上传文本时分隔符左边设置的说话人:{ChoseSpeakerFromConfigList1}|{SeakerInUploadText1}\n{ChoseSpeakerFromConfigList2}|{SeakerInUploadText2}\n{ChoseSpeakerFromConfigList3}|{SeakerInUploadText3}\n",
337
- value = "ましろ|真白\n七深|七深\n透子|透子\nつくし|筑紫\n瑠唯|瑠唯\nそよ|素世\n祥子|祥子",
338
- )
339
- speaker = gr.Dropdown(
340
- choices=speakers, value = "ましろ", label="选择默认说话人"
341
- )
342
- with gr.Column():
343
- sdp_ratio = gr.Slider(
344
- minimum=0, maximum=1, value=0.2, step=0.01, label="SDP/DP混合比"
345
- )
346
- noise_scale = gr.Slider(
347
- minimum=0.1, maximum=2, value=0.6, step=0.01, label="感情调节"
348
- )
349
- noise_scale_w = gr.Slider(
350
- minimum=0.1, maximum=2, value=0.8, step=0.01, label="音素长度"
351
- )
352
- length_scale = gr.Slider(
353
- minimum=0.1, maximum=2, value=1, step=0.01, label="生成长度"
354
- )
355
- LastAudioOutput = gr.Audio(label="当使用cuda时才能在本地文件夹浏览全部文件")
356
- btn2 = gr.Button("点击生成", variant="primary")
357
- btn2.click(
358
- audiobook,
359
- inputs=[
360
- inputFile,
361
- groupSize,
362
- speaker,
363
- sdp_ratio,
364
- noise_scale,
365
- noise_scale_w,
366
- length_scale,
367
- spealerList,
368
- silenceTime,
369
- filepath
370
- ],
371
- outputs=[LastAudioOutput],
372
- )
373
- print("推理页面已开启!")
374
- app.launch()
 
1
+ import argparse
2
  import os
3
+ from pathlib import Path
4
 
5
+ import logging
6
  import re_matching
7
 
8
  logging.getLogger("numba").setLevel(logging.WARNING)
 
16
 
17
  logger = logging.getLogger(__name__)
18
 
19
+ import librosa
20
+ import numpy as np
21
+ import torch
22
+ import torch.nn as nn
23
+ from torch.utils.data import Dataset
24
+ from torch.utils.data import DataLoader, Dataset
25
+ from tqdm import tqdm
26
+ from clap_wrapper import get_clap_audio_feature, get_clap_text_feature
27
 
 
28
 
 
 
 
 
 
29
  import gradio as gr
 
 
 
 
30
 
31
+ import utils
32
+ from config import config
33
 
34
+ import torch
35
+ import commons
36
+ from text import cleaned_text_to_sequence, get_bert
37
+ from text.cleaner import clean_text
38
+ import utils
39
 
40
+ from models import SynthesizerTrn
41
+ from text.symbols import symbols
42
+ import sys
43
 
44
  net_g = None
45
+ '''
46
+ device = (
47
+ "cuda:0"
48
+ if torch.cuda.is_available()
49
+ else (
50
+ "mps"
51
+ if sys.platform == "darwin" and torch.backends.mps.is_available()
52
+ else "cpu"
53
+ )
54
+ )
55
+ '''
56
+ device = "cpu"
57
  BandList = {
 
58
  "PoppinParty":["香澄","有咲","たえ","りみ","沙綾"],
59
  "Afterglow":["蘭","モカ","ひまり","巴","つぐみ"],
60
  "HelloHappyWorld":["こころ","美咲","薫","花音","はぐみ"],
 
62
  "Roselia":["友希那","紗夜","リサ","燐子","あこ"],
63
  "RaiseASuilen":["レイヤ","ロック","ますき","チュチュ","パレオ"],
64
  "Morfonica":["ましろ","瑠唯","つくし","七深","透子"],
65
+ "MyGo":["燈","愛音","そよ","立希","楽奈"],
66
+ "AveMujica":["祥子","睦","海鈴","にゃむ","初華"],
67
  "圣翔音乐学园":["華戀","光","香子","雙葉","真晝","純那","克洛迪娜","真矢","奈奈"],
68
  "凛明馆女子学校":["珠緒","壘","文","悠悠子","一愛"],
69
  "弗隆提亚艺术学校":["艾露","艾露露","菈樂菲","司","靜羽"],
70
  "西克菲尔特音乐学院":["晶","未知留","八千代","栞","美帆"]
71
  }
72
 
73
+ def get_net_g(model_path: str, device: str, hps):
74
+ net_g = SynthesizerTrn(
75
+ len(symbols),
76
+ hps.data.filter_length // 2 + 1,
77
+ hps.train.segment_size // hps.data.hop_length,
78
+ n_speakers=hps.data.n_speakers,
79
+ **hps.model,
80
+ ).to(device)
81
+ _ = net_g.eval()
82
+ _ = utils.load_checkpoint(model_path, net_g, None, skip_optimizer=True)
83
+ return net_g
84
+
85
+ def get_text(text, language_str, hps, device):
86
+ # 在此处实现当前版本的get_text
87
+ norm_text, phone, tone, word2ph = clean_text(text, language_str)
88
+ phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
89
+
90
+ if hps.data.add_blank:
91
+ phone = commons.intersperse(phone, 0)
92
+ tone = commons.intersperse(tone, 0)
93
+ language = commons.intersperse(language, 0)
94
+ for i in range(len(word2ph)):
95
+ word2ph[i] = word2ph[i] * 2
96
+ word2ph[0] += 1
97
+ bert_ori = get_bert(norm_text, word2ph, language_str, device)
98
+ del word2ph
99
+ assert bert_ori.shape[-1] == len(phone), phone
100
+
101
+ if language_str == "ZH":
102
+ bert = bert_ori
103
+ ja_bert = torch.zeros(1024, len(phone))
104
+ en_bert = torch.zeros(1024, len(phone))
105
+ elif language_str == "JP":
106
+ bert = torch.zeros(1024, len(phone))
107
+ ja_bert = bert_ori
108
+ en_bert = torch.zeros(1024, len(phone))
109
+ else:
110
+ raise ValueError("language_str should be ZH, JP or EN")
111
+
112
+ assert bert.shape[-1] == len(
113
+ phone
114
+ ), f"Bert seq len {bert.shape[-1]} != {len(phone)}"
115
+
116
+ phone = torch.LongTensor(phone)
117
+ tone = torch.LongTensor(tone)
118
+ language = torch.LongTensor(language)
119
+ return bert, ja_bert, en_bert, phone, tone, language
120
 
121
+ def infer(
122
  text,
123
  sdp_ratio,
124
  noise_scale,
125
  noise_scale_w,
126
  length_scale,
127
+ sid,
128
+ reference_audio=None,
129
+ emotion='Happy',
130
  ):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
+ language= 'JP' if is_japanese(text) else 'ZH'
133
+ if isinstance(reference_audio, np.ndarray):
134
+ emo = get_clap_audio_feature(reference_audio, device)
135
+ else:
136
+ emo = get_clap_text_feature(emotion, device)
137
+ emo = torch.squeeze(emo, dim=1)
138
+ bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
139
+ text, language, hps, device
140
+ )
141
+ with torch.no_grad():
142
+ x_tst = phones.to(device).unsqueeze(0)
143
+ tones = tones.to(device).unsqueeze(0)
144
+ lang_ids = lang_ids.to(device).unsqueeze(0)
145
+ bert = bert.to(device).unsqueeze(0)
146
+ ja_bert = ja_bert.to(device).unsqueeze(0)
147
+ en_bert = en_bert.to(device).unsqueeze(0)
148
+ x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
149
+ emo = emo.to(device).unsqueeze(0)
150
+ del phones
151
+ speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
152
+ audio = (
153
+ net_g.infer(
154
+ x_tst,
155
+ x_tst_lengths,
156
+ speakers,
157
+ tones,
158
+ lang_ids,
159
+ bert,
160
+ ja_bert,
161
+ en_bert,
162
+ emo,
163
  sdp_ratio=sdp_ratio,
164
  noise_scale=noise_scale,
165
  noise_scale_w=noise_scale_w,
166
  length_scale=length_scale,
167
+ )[0][0, 0]
168
+ .data.cpu()
169
+ .float()
170
+ .numpy()
171
+ )
172
+ del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers, ja_bert, en_bert, emo
173
+ if torch.cuda.is_available():
174
  torch.cuda.empty_cache()
175
+ return (hps.data.sampling_rate,gr.processing_utils.convert_to_16_bit_wav(audio))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
 
177
+ def is_japanese(string):
178
+ for ch in string:
179
+ if ord(ch) > 0x3040 and ord(ch) < 0x30FF:
180
+ return True
181
+ return False
 
 
 
 
 
 
 
182
 
183
  def loadmodel(model):
184
  _ = net_g.eval()
 
186
  return "success"
187
 
188
  if __name__ == "__main__":
 
 
 
 
 
 
 
189
  languages = [ "Auto", "ZH", "JP"]
190
  modelPaths = []
191
+ for dirpath, dirnames, filenames in os.walk('Data/BangDreamV22/models/'):
192
  for filename in filenames:
193
  modelPaths.append(os.path.join(dirpath, filename))
194
+ hps = utils.get_hparams_from_file('Data/BangDreamV22/configs/config.json')
195
+ net_g = get_net_g(
196
+ model_path=modelPaths[-1], device=device, hps=hps
197
+ )
198
+ speaker_ids = hps.data.spk2id
199
+ speakers = list(speaker_ids.keys())
200
  with gr.Blocks() as app:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  for band in BandList:
202
  with gr.TabItem(band):
203
  for name in BandList[band]:
204
  with gr.TabItem(name):
205
+ classifiedPaths = []
206
+ for dirpath, dirnames, filenames in os.walk("Data/Bushiroad/classifedSample/"+name):
207
+ for filename in filenames:
208
+ classifiedPaths.append(os.path.join(dirpath, filename))
209
  with gr.Row():
210
  with gr.Column():
211
  with gr.Row():
212
  gr.Markdown(
213
  '<div align="center">'
214
+ f'<img style="width:auto;height:400px;" src="https://mahiruoshi-bangdream-bert-vits2.hf.space/file/image/{name}.png">'
215
  '</div>'
216
  )
217
  length_scale = gr.Slider(
218
  minimum=0.1, maximum=2, value=1, step=0.01, label="语速调节"
219
  )
220
+ emotion = gr.Textbox(
221
+ label="Text prompt",
222
+ placeholder="用文字描述生成风格。如:Happy",
223
+ value="Happy",
224
+ visible=True,
225
+ )
226
+ with gr.Accordion(label="参数设定", open=False):
227
+ sdp_ratio = gr.Slider(
228
+ minimum=0, maximum=1, value=0.2, step=0.01, label="SDP/DP混合比"
229
+ )
230
+ noise_scale = gr.Slider(
231
+ minimum=0.1, maximum=2, value=0.6, step=0.01, label="感情调节"
232
+ )
233
+ noise_scale_w = gr.Slider(
234
+ minimum=0.1, maximum=2, value=0.8, step=0.01, label="音素长度"
235
+ )
236
+ speaker = gr.Dropdown(
237
+ choices=speakers, value=name, label="说话人"
238
+ )
239
  with gr.Accordion(label="切换模型", open=False):
240
  modelstrs = gr.Dropdown(label = "模型", choices = modelPaths, value = modelPaths[0], type = "value")
241
  btnMod = gr.Button("载入模型")
 
245
  text = gr.TextArea(
246
  label="输入纯日语或者中文",
247
  placeholder="输入纯日语或者中文",
248
+ value="为什么要演奏春日影!",
249
+ )
250
+ try:
251
+ reference_audio = gr.Dropdown(label = "情感参考", choices = classifiedPaths, value = classifiedPaths[0], type = "value")
252
+ except:
253
+ reference_audio = gr.Audio(label="情感参考音频)", type="filepath")
254
  btn = gr.Button("点击生成", variant="primary")
255
  audio_output = gr.Audio(label="Output Audio")
256
+ '''
257
  btntran = gr.Button("快速中翻日")
258
  translateResult = gr.TextArea("从这复制翻译后的文本")
259
  btntran.click(translate, inputs=[text], outputs = [translateResult])
260
+ '''
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
  btn.click(
262
+ infer,
263
  inputs=[
264
  text,
 
265
  sdp_ratio,
266
  noise_scale,
267
  noise_scale_w,
268
  length_scale,
269
+ speaker,
270
+ reference_audio,
271
+ emotion,
272
  ],
273
  outputs=[audio_output],
274
  )
275
 
276
+ print("推理页面已开启!")
277
+ app.launch(share=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bert/bert_models.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "deberta-v2-large-japanese": {
3
- "repo_id": "ku-nlp/deberta-v2-large-japanese",
4
  "files": ["pytorch_model.bin"]
5
  },
6
  "chinese-roberta-wwm-ext-large": {
 
1
  {
2
+ "deberta-v2-large-japanese-char-wwm": {
3
+ "repo_id": "ku-nlp/deberta-v2-large-japanese-char-wwm",
4
  "files": ["pytorch_model.bin"]
5
  },
6
  "chinese-roberta-wwm-ext-large": {
bert/deberta-v2-large-japanese-char-wwm/.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
bert/deberta-v2-large-japanese-char-wwm/README.md ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: ja
3
+ license: cc-by-sa-4.0
4
+ library_name: transformers
5
+ tags:
6
+ - deberta
7
+ - deberta-v2
8
+ - fill-mask
9
+ - character
10
+ - wwm
11
+ datasets:
12
+ - wikipedia
13
+ - cc100
14
+ - oscar
15
+ metrics:
16
+ - accuracy
17
+ mask_token: "[MASK]"
18
+ widget:
19
+ - text: "京都大学で自然言語処理を[MASK][MASK]する。"
20
+ ---
21
+
22
+ # Model Card for Japanese character-level DeBERTa V2 large
23
+
24
+ ## Model description
25
+
26
+ This is a Japanese DeBERTa V2 large model pre-trained on Japanese Wikipedia, the Japanese portion of CC-100, and the Japanese portion of OSCAR.
27
+ This model is trained with character-level tokenization and whole word masking.
28
+
29
+ ## How to use
30
+
31
+ You can use this model for masked language modeling as follows:
32
+
33
+ ```python
34
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
35
+ tokenizer = AutoTokenizer.from_pretrained('ku-nlp/deberta-v2-large-japanese-char-wwm')
36
+ model = AutoModelForMaskedLM.from_pretrained('ku-nlp/deberta-v2-large-japanese-char-wwm')
37
+
38
+ sentence = '京都大学で自然言語処理を[MASK][MASK]する。'
39
+ encoding = tokenizer(sentence, return_tensors='pt')
40
+ ...
41
+ ```
42
+
43
+ You can also fine-tune this model on downstream tasks.
44
+
45
+ ## Tokenization
46
+
47
+ There is no need to tokenize texts in advance, and you can give raw texts to the tokenizer.
48
+ The texts are tokenized into character-level tokens by [sentencepiece](https://github.com/google/sentencepiece).
49
+
50
+ ## Training data
51
+
52
+ We used the following corpora for pre-training:
53
+
54
+ - Japanese Wikipedia (as of 20221020, 3.2GB, 27M sentences, 1.3M documents)
55
+ - Japanese portion of CC-100 (85GB, 619M sentences, 66M documents)
56
+ - Japanese portion of OSCAR (54GB, 326M sentences, 25M documents)
57
+
58
+ Note that we filtered out documents annotated with "header", "footer", or "noisy" tags in OSCAR.
59
+ Also note that Japanese Wikipedia was duplicated 10 times to make the total size of the corpus comparable to that of CC-100 and OSCAR. As a result, the total size of the training data is 171GB.
60
+
61
+ ## Training procedure
62
+
63
+ We first segmented texts in the corpora into words using [Juman++ 2.0.0-rc3](https://github.com/ku-nlp/jumanpp/releases/tag/v2.0.0-rc3) for whole word masking.
64
+ Then, we built a sentencepiece model with 22,012 tokens including all characters that appear in the training corpus.
65
+
66
+ We tokenized raw corpora into character-level subwords using the sentencepiece model and trained the Japanese DeBERTa model using [transformers](https://github.com/huggingface/transformers) library.
67
+ The training took 26 days using 16 NVIDIA A100-SXM4-40GB GPUs.
68
+
69
+ The following hyperparameters were used during pre-training:
70
+
71
+ - learning_rate: 1e-4
72
+ - per_device_train_batch_size: 26
73
+ - distributed_type: multi-GPU
74
+ - num_devices: 16
75
+ - gradient_accumulation_steps: 8
76
+ - total_train_batch_size: 3,328
77
+ - max_seq_length: 512
78
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-06
79
+ - lr_scheduler_type: linear schedule with warmup (lr = 0 at 300k steps)
80
+ - training_steps: 260,000
81
+ - warmup_steps: 10,000
82
+
83
+ The accuracy of the trained model on the masked language modeling task was 0.795.
84
+ The evaluation set consists of 5,000 randomly sampled documents from each of the training corpora.
85
+
86
+ ## Acknowledgments
87
+
88
+ This work was supported by Joint Usage/Research Center for Interdisciplinary Large-scale Information Infrastructures (JHPCN) through General Collaboration Project no. jh221004, "Developing a Platform for Constructing and Sharing of Large-Scale Japanese Language Models".
89
+ For training models, we used the mdx: a platform for the data-driven future.
bert/deberta-v2-large-japanese-char-wwm/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "DebertaV2ForMaskedLM"
4
+ ],
5
+ "attention_head_size": 64,
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "conv_act": "gelu",
8
+ "conv_kernel_size": 3,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 4096,
14
+ "layer_norm_eps": 1e-07,
15
+ "max_position_embeddings": 512,
16
+ "max_relative_positions": -1,
17
+ "model_type": "deberta-v2",
18
+ "norm_rel_ebd": "layer_norm",
19
+ "num_attention_heads": 16,
20
+ "num_hidden_layers": 24,
21
+ "pad_token_id": 0,
22
+ "pooler_dropout": 0,
23
+ "pooler_hidden_act": "gelu",
24
+ "pooler_hidden_size": 1024,
25
+ "pos_att_type": [
26
+ "p2c",
27
+ "c2p"
28
+ ],
29
+ "position_biased_input": false,
30
+ "position_buckets": 256,
31
+ "relative_attention": true,
32
+ "share_att_key": true,
33
+ "torch_dtype": "float16",
34
+ "transformers_version": "4.25.1",
35
+ "type_vocab_size": 0,
36
+ "vocab_size": 22012
37
+ }
bert/deberta-v2-large-japanese-char-wwm/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf0dab8ad87bd7c22e85ec71e04f2240804fda6d33196157d6b5923af6ea1201
3
+ size 1318456639
bert/deberta-v2-large-japanese-char-wwm/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
bert/deberta-v2-large-japanese-char-wwm/tokenizer_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "do_lower_case": false,
4
+ "do_subword_tokenize": true,
5
+ "do_word_tokenize": true,
6
+ "jumanpp_kwargs": null,
7
+ "mask_token": "[MASK]",
8
+ "mecab_kwargs": null,
9
+ "model_max_length": 1000000000000000019884624838656,
10
+ "never_split": null,
11
+ "pad_token": "[PAD]",
12
+ "sep_token": "[SEP]",
13
+ "special_tokens_map_file": null,
14
+ "subword_tokenizer_type": "character",
15
+ "sudachi_kwargs": null,
16
+ "tokenizer_class": "BertJapaneseTokenizer",
17
+ "unk_token": "[UNK]",
18
+ "word_tokenizer_type": "basic"
19
+ }
bert/deberta-v2-large-japanese-char-wwm/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
bert_gen.py CHANGED
@@ -1,12 +1,14 @@
 
 
 
1
  import torch
2
- from multiprocessing import Pool
 
 
3
  import commons
4
  import utils
5
- from tqdm import tqdm
6
- from text import check_bert_models, cleaned_text_to_sequence, get_bert
7
- import argparse
8
- import torch.multiprocessing as mp
9
  from config import config
 
10
 
11
 
12
  def process_line(line):
@@ -57,7 +59,6 @@ if __name__ == "__main__":
57
  args, _ = parser.parse_known_args()
58
  config_path = args.config
59
  hps = utils.get_hparams_from_file(config_path)
60
- check_bert_models()
61
  lines = []
62
  with open(hps.data.training_files, encoding="utf-8") as f:
63
  lines.extend(f.readlines())
@@ -65,7 +66,7 @@ if __name__ == "__main__":
65
  with open(hps.data.validation_files, encoding="utf-8") as f:
66
  lines.extend(f.readlines())
67
  if len(lines) != 0:
68
- num_processes = args.num_processes
69
  with Pool(processes=num_processes) as pool:
70
  for _ in tqdm(pool.imap_unordered(process_line, lines), total=len(lines)):
71
  pass
 
1
+ import argparse
2
+ from multiprocessing import Pool, cpu_count
3
+
4
  import torch
5
+ import torch.multiprocessing as mp
6
+ from tqdm import tqdm
7
+
8
  import commons
9
  import utils
 
 
 
 
10
  from config import config
11
+ from text import cleaned_text_to_sequence, get_bert
12
 
13
 
14
  def process_line(line):
 
59
  args, _ = parser.parse_known_args()
60
  config_path = args.config
61
  hps = utils.get_hparams_from_file(config_path)
 
62
  lines = []
63
  with open(hps.data.training_files, encoding="utf-8") as f:
64
  lines.extend(f.readlines())
 
66
  with open(hps.data.validation_files, encoding="utf-8") as f:
67
  lines.extend(f.readlines())
68
  if len(lines) != 0:
69
+ num_processes = min(args.num_processes, cpu_count())
70
  with Pool(processes=num_processes) as pool:
71
  for _ in tqdm(pool.imap_unordered(process_line, lines), total=len(lines)):
72
  pass
clap_gen.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from multiprocessing import Pool, cpu_count
3
+
4
+ import torch
5
+ import torch.multiprocessing as mp
6
+ from tqdm import tqdm
7
+
8
+ import utils
9
+ from config import config
10
+ from clap_wrapper import get_clap_audio_feature
11
+ import librosa
12
+ import os
13
+
14
+ os.environ["OMP_NUM_THREADS"] = "1"
15
+ os.environ["MKL_NUM_THREADS"] = "1"
16
+
17
+
18
def process_line(line):
    """Compute and store the CLAP audio embedding for one filelist entry.

    Args:
        line: One filelist row with exactly seven ``|``-separated fields; the
            first field is the path of the wav file. Only that path is used
            here, but a row with a different field count raises ``ValueError``.

    Returns:
        None. The embedding is written with ``torch.save`` next to the audio
        file, using the wav path with its ``.wav``/``.WAV`` suffix replaced by
        ``.emo.npy``. Rows whose output file already exists are skipped.
    """
    device = config.emo_gen_config.device
    if config.emo_gen_config.use_multi_device:
        # Spread pool workers over the visible GPUs using the worker's
        # identity inside the multiprocessing pool (0 for the main process).
        rank = mp.current_process()._identity
        rank = rank[0] if len(rank) > 0 else 0
        if torch.cuda.is_available():
            gpu_id = rank % torch.cuda.device_count()
            device = torch.device(f"cuda:{gpu_id}")
        else:
            device = torch.device("cpu")
    wav_path, _spk, _language_str, _text, _phones, _tone, _word2ph = (
        line.strip().split("|")
    )

    clap_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".emo.npy")
    if os.path.isfile(clap_path):
        return

    # The sample rate must be passed by keyword: librosa >= 0.10 made every
    # argument after the path keyword-only, so a positional 48000 raises
    # TypeError there. 48000 matches the sampling_rate handed to the CLAP
    # processor in clap_wrapper.
    audio = librosa.load(wav_path, sr=48000)[0]

    clap = get_clap_audio_feature(audio, device)
    torch.save(clap, clap_path)
39
+
40
+
41
if __name__ == "__main__":
    # Command-line entry point: read the training and validation filelists
    # named in the hparams file and run process_line over every row in a
    # process pool.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-c", "--config", type=str, default=config.emo_gen_config.config_path
    )
    parser.add_argument(
        "--num_processes", type=int, default=config.emo_gen_config.num_processes
    )
    args, _ = parser.parse_known_args()
    config_path = args.config
    hps = utils.get_hparams_from_file(config_path)

    lines = []
    for list_path in (hps.data.training_files, hps.data.validation_files):
        with open(list_path, encoding="utf-8") as f:
            lines.extend(f.readlines())

    if lines:
        # Never start more workers than there are CPU cores.
        num_processes = min(args.num_processes, cpu_count())
        with Pool(processes=num_processes) as pool:
            # The loop only drains the iterator so tqdm can show progress.
            for _ in tqdm(pool.imap_unordered(process_line, lines), total=len(lines)):
                pass

    # The files written by process_line end in ".emo.npy", not ".pt".
    print(f"clap生成完毕!, 共有{len(lines)}个.emo.npy生成!")
clap_wrapper.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+
3
+ import torch
4
+ from transformers import ClapModel, ClapProcessor
5
+
6
+ from config import config
7
+
8
+ models = dict()
9
+ processor = ClapProcessor.from_pretrained("./emotional/clap-htsat-fused")
10
+
11
+
12
def _resolve_device(device):
    """Return the device that should actually be used for *device*."""
    # On macOS a "cpu" request is upgraded to MPS when it is available.
    if (
        sys.platform == "darwin"
        and torch.backends.mps.is_available()
        and device == "cpu"
    ):
        return "mps"
    # An empty/None device falls back to CUDA.
    if not device:
        return "cuda"
    return device


def _get_model(device):
    """Return the CLAP model cached for *device*, loading it on first use."""
    if device not in models:
        models[device] = ClapModel.from_pretrained("./emotional/clap-htsat-fused").to(
            device
        )
    return models[device]


def get_clap_audio_feature(audio_data, device=config.bert_gen_config.device):
    """Embed raw audio with the CLAP audio tower.

    Args:
        audio_data: Audio samples handed to the CLAP processor; the processor
            is told they are sampled at 48000 Hz.
        device: Device to run on. ``"cpu"`` becomes ``"mps"`` on macOS when MPS
            is available, and a falsy value becomes ``"cuda"``.

    Returns:
        The transposed (``.T``) output of ``ClapModel.get_audio_features``.
    """
    device = _resolve_device(device)
    model = _get_model(device)
    with torch.no_grad():
        inputs = processor(
            audios=audio_data, return_tensors="pt", sampling_rate=48000
        ).to(device)
        emb = model.get_audio_features(**inputs)
    return emb.T


def get_clap_text_feature(text, device=config.bert_gen_config.device):
    """Embed a text prompt with the CLAP text tower.

    Args:
        text: Text handed to the CLAP processor.
        device: Device to run on; resolved exactly as in
            ``get_clap_audio_feature``.

    Returns:
        The transposed (``.T``) output of ``ClapModel.get_text_features``.
    """
    device = _resolve_device(device)
    model = _get_model(device)
    with torch.no_grad():
        inputs = processor(text=text, return_tensors="pt").to(device)
        emb = model.get_text_features(**inputs)
    return emb.T
commons.py CHANGED
@@ -46,26 +46,18 @@ def rand_gumbel_like(x):
46
 
47
 
48
  def slice_segments(x, ids_str, segment_size=4):
49
- ret = torch.zeros_like(x[:, :, :segment_size])
50
- for i in range(x.size(0)):
51
- idx_str = ids_str[i]
52
- idx_end = idx_str + segment_size
53
- if idx_str < 0:
54
- i1 = x.size(2) + idx_str
55
- r1 = x[i, :, i1:]
56
- r2 = x[i, :, :idx_end]
57
- ret[i] = torch.cat([r1, r2], dim=1)
58
- else:
59
- ret[i] = x[i, :, idx_str:idx_end]
60
- return ret
61
 
62
 
63
  def rand_slice_segments(x, x_lengths=None, segment_size=4):
64
  b, d, t = x.size()
65
  if x_lengths is None:
66
  x_lengths = t
67
- ids_str_max = x_lengths - segment_size + 1
68
- ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
69
  ret = slice_segments(x, ids_str, segment_size)
70
  return ret, ids_str
71
 
 
46
 
47
 
48
def slice_segments(x, ids_str, segment_size=4):
    """Cut one fixed-length window out of every batch item along the last axis.

    Args:
        x: Three-dimensional tensor indexed as ``x[b, c, t]``.
        ids_str: Integer tensor holding one start offset per batch item
            (``x.size(0)`` elements), on the same device as ``x``.
        segment_size: Number of consecutive positions taken per item.

    Returns:
        Tensor of shape ``(x.size(0), x.size(1), segment_size)`` with
        ``out[b, c, k] == x[b, c, ids_str[b] + k]``.
    """
    # Repeat each start offset over the channel axis, then add
    # 0..segment_size-1 to get the absolute index of every gathered element.
    gather_indices = ids_str.view(x.size(0), 1, 1).repeat(
        1, x.size(1), 1
    ) + torch.arange(segment_size, device=x.device)
    return torch.gather(x, 2, gather_indices)


def rand_slice_segments(x, x_lengths=None, segment_size=4):
    """Slice a randomly positioned window out of every batch item.

    Args:
        x: Three-dimensional tensor indexed as ``x[b, c, t]``.
        x_lengths: Valid length of each batch item (tensor or scalar). When
            ``None`` the full last-axis length of ``x`` is used for all items.
        segment_size: Length of the window to cut.

    Returns:
        Tuple ``(segments, ids_str)``: the sliced tensor produced by
        ``slice_segments`` and the long tensor of start offsets that was drawn.
    """
    b, _, t = x.size()
    if x_lengths is None:
        x_lengths = t
    # torch.clamp only accepts tensors; without this conversion the
    # ``x_lengths is None`` path (a plain Python int) raised TypeError.
    x_lengths = torch.as_tensor(x_lengths, device=x.device)
    # Largest admissible start (exclusive); clamped so that items shorter than
    # the window yield offset 0 instead of a negative start.
    ids_str_max = torch.clamp(x_lengths - segment_size + 1, min=0)
    ids_str = (torch.rand([b], device=x.device) * ids_str_max).to(dtype=torch.long)
    ret = slice_segments(x, ids_str, segment_size)
    return ret, ids_str
63
 
compress_model.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import OrderedDict
2
+ from text.symbols import symbols
3
+ import torch
4
+
5
+ from tools.log import logger
6
+ import utils
7
+ from models import SynthesizerTrn
8
+ import os
9
+
10
+
11
def copyStateDict(state_dict):
    """Return an ordered copy of *state_dict* without a leading ``module`` prefix.

    Whether the prefix is stripped is decided from the first key only: if it
    starts with ``"module"``, the first dot-separated component is dropped from
    every key; otherwise keys are copied unchanged.

    Args:
        state_dict: Mapping from (possibly dotted) string names to values.

    Returns:
        An ``OrderedDict`` with the same values and insertion order. An empty
        input yields an empty ``OrderedDict``.
    """
    new_state_dict = OrderedDict()
    if not state_dict:
        # Nothing to inspect; the first-key lookup below would fail.
        return new_state_dict

    start_idx = 1 if next(iter(state_dict)).startswith("module") else 0
    for k, v in state_dict.items():
        # Re-join with "." (the original used ","), so dotted parameter names
        # such as "enc_p.emb.weight" survive the round trip intact.
        name = ".".join(k.split(".")[start_idx:])
        new_state_dict[name] = v
    return new_state_dict
21
+
22
+
23
def removeOptimizer(config: str, input_model: str, ishalf: bool, output_model: str):
    """Write a slimmed-down copy of a generator checkpoint.

    The output keeps every entry of the checkpoint's ``"model"`` dict whose
    name does not contain ``"enc_q"``, optionally cast to half precision, and
    pairs it with the state of a freshly constructed AdamW optimizer, an
    iteration count of 0 and a learning rate of 0.0001.

    Args:
        config: Path of the hparams file describing the model.
        input_model: Path of the checkpoint to read (loaded onto the CPU).
        ishalf: When true, call ``.half()`` on every kept weight.
        output_model: Path the reduced checkpoint is saved to.
    """
    hps = utils.get_hparams_from_file(config)

    # A model instance is built only so that an optimizer with a fresh state
    # can be created for it.
    spec_channels = hps.data.filter_length // 2 + 1
    segment_frames = hps.train.segment_size // hps.data.hop_length
    net_g = SynthesizerTrn(
        len(symbols),
        spec_channels,
        segment_frames,
        n_speakers=hps.data.n_speakers,
        **hps.model,
    )
    optim_g = torch.optim.AdamW(
        net_g.parameters(),
        hps.train.learning_rate,
        betas=hps.train.betas,
        eps=hps.train.eps,
    )

    checkpoint = copyStateDict(torch.load(input_model, map_location="cpu"))
    weights = checkpoint["model"]
    kept_names = [name for name in weights if "enc_q" not in name]

    if ishalf:
        stripped = {name: weights[name].half() for name in kept_names}
    else:
        stripped = {name: weights[name] for name in kept_names}

    payload = {
        "model": stripped,
        "iteration": 0,
        "optimizer": optim_g.state_dict(),
        "learning_rate": 0.0001,
    }
    torch.save(payload, output_model)
64
+
65
+
66
if __name__ == "__main__":
    # Command-line entry point: strip a checkpoint with removeOptimizer and
    # log where the result was written.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--config", type=str, default="configs/config.json")
    # The input checkpoint is mandatory: without it the default output name
    # cannot be derived and there is nothing to load.
    parser.add_argument("-i", "--input", type=str, required=True)
    parser.add_argument("-o", "--output", type=str, default=None)
    parser.add_argument(
        "-hf", "--half", action="store_true", default=False, help="Save as FP16"
    )

    args = parser.parse_args()

    output = args.output

    if output is None:
        # Default output: "<input stem>_release[_half]<input extension>".
        filename, ext = os.path.splitext(args.input)
        half = "_half" if args.half else ""
        output = filename + "_release" + half + ext

    removeOptimizer(args.config, args.input, args.half, output)
    logger.info(f"压缩模型成功, 输出模型: {os.path.abspath(output)}")
config.py CHANGED
@@ -38,7 +38,7 @@ class Preprocess_text_config:
38
  train_path: str,
39
  val_path: str,
40
  config_path: str,
41
- val_per_spk: int = 5,
42
  max_val_total: int = 10000,
43
  clean: bool = True,
44
  ):
@@ -47,7 +47,7 @@ class Preprocess_text_config:
47
  self.train_path: str = train_path # 训练集路径,可以不填。不填则将在原始文本目录生成
48
  self.val_path: str = val_path # 验证集路径,可以不填。不填则将在原始文本目录生成
49
  self.config_path: str = config_path # 配置文件路径
50
- self.val_per_spk: int = val_per_spk # 每个speaker的验证集条数
51
  self.max_val_total: int = max_val_total # 验证集最大条数,多于的会被截断并放到训练集中
52
  self.clean: bool = clean # 是否进行数据清洗
53
 
@@ -99,10 +99,12 @@ class Emo_gen_config:
99
  config_path: str,
100
  num_processes: int = 2,
101
  device: str = "cuda",
 
102
  ):
103
  self.config_path = config_path
104
  self.num_processes = num_processes
105
  self.device = device
 
106
 
107
  @classmethod
108
  def from_dict(cls, dataset_path: str, data: Dict[str, any]):
@@ -120,11 +122,17 @@ class Train_ms_config:
120
  env: Dict[str, any],
121
  base: Dict[str, any],
122
  model: str,
 
 
 
123
  ):
124
  self.env = env # 需要加载的环境变量
125
  self.base = base # 底模配置
126
  self.model = model # 训练模型存储目录,该路径为相对于dataset_path的路径,而非项目根目录
127
  self.config_path = config_path # 配置文件路径
 
 
 
128
 
129
  @classmethod
130
  def from_dict(cls, dataset_path: str, data: Dict[str, any]):
@@ -216,6 +224,9 @@ class Config:
216
  self.bert_gen_config: Bert_gen_config = Bert_gen_config.from_dict(
217
  dataset_path, yaml_config["bert_gen"]
218
  )
 
 
 
219
  self.train_ms_config: Train_ms_config = Train_ms_config.from_dict(
220
  dataset_path, yaml_config["train_ms"]
221
  )
 
38
  train_path: str,
39
  val_path: str,
40
  config_path: str,
41
+ val_per_lang: int = 5,
42
  max_val_total: int = 10000,
43
  clean: bool = True,
44
  ):
 
47
  self.train_path: str = train_path # 训练集路径,可以不填。不填则将在原始文本目录生成
48
  self.val_path: str = val_path # 验证集路径,可以不填。不填则将在原始文本目录生成
49
  self.config_path: str = config_path # 配置文件路径
50
+ self.val_per_lang: int = val_per_lang # 每个speaker的验证集条数
51
  self.max_val_total: int = max_val_total # 验证集最大条数,多于的会被截断并放到训练集中
52
  self.clean: bool = clean # 是否进行数据清洗
53
 
 
99
  config_path: str,
100
  num_processes: int = 2,
101
  device: str = "cuda",
102
+ use_multi_device: bool = False,
103
  ):
104
  self.config_path = config_path
105
  self.num_processes = num_processes
106
  self.device = device
107
+ self.use_multi_device = use_multi_device
108
 
109
  @classmethod
110
  def from_dict(cls, dataset_path: str, data: Dict[str, any]):
 
122
  env: Dict[str, any],
123
  base: Dict[str, any],
124
  model: str,
125
+ num_workers: int,
126
+ spec_cache: bool,
127
+ keep_ckpts: int,
128
  ):
129
  self.env = env # 需要加载的环境变量
130
  self.base = base # 底模配置
131
  self.model = model # 训练模型存储目录,该路径为相对于dataset_path的路径,而非项目根目录
132
  self.config_path = config_path # 配置文件路径
133
+ self.num_workers = num_workers # worker数量
134
+ self.spec_cache = spec_cache # 是否启用spec缓存
135
+ self.keep_ckpts = keep_ckpts # ckpt数量
136
 
137
  @classmethod
138
  def from_dict(cls, dataset_path: str, data: Dict[str, any]):
 
224
  self.bert_gen_config: Bert_gen_config = Bert_gen_config.from_dict(
225
  dataset_path, yaml_config["bert_gen"]
226
  )
227
+ self.emo_gen_config: Emo_gen_config = Emo_gen_config.from_dict(
228
+ dataset_path, yaml_config["emo_gen"]
229
+ )
230
  self.train_ms_config: Train_ms_config = Train_ms_config.from_dict(
231
  dataset_path, yaml_config["train_ms"]
232
  )
config.yml CHANGED
@@ -4,7 +4,7 @@
4
  # 拟提供通用路径配置,统一存放数据,避免数据放得很乱
5
  # 每个数据集与其对应的模型存放至统一路径下,后续所有的路径配置均为相对于datasetPath的路径
6
  # 不填或者填空则路径为相对于项目根目录的路径
7
- dataset_path: ""
8
 
9
  # 模型镜像源,默认huggingface,使用openi镜像源需指定openi_token
10
  mirror: ""
@@ -17,16 +17,16 @@ resample:
17
  sampling_rate: 44100
18
  # 音频文件输入路径,重采样会将该路径下所有.wav音频文件重采样
19
  # 请填入相对于datasetPath的相对路径
20
- in_dir: "" # 相对于根目录的路径为 /datasetPath/in_dir
21
  # 音频文件重采样后输出路径
22
- out_dir: ""
23
 
24
 
25
  # preprocess_text 数据集预处理相关配置
26
  # 注意, “:” 后需要加空格
27
  preprocess_text:
28
  # 原始文本文件路径,文本格式应为{wav_path}|{speaker_name}|{language}|{text}。
29
- transcription_path: "filelists/bushroid.list"
30
  # 数据清洗后文本路径,可以不填。不填则将在原始文本目录生成
31
  cleaned_path: ""
32
  # 训练集路径
@@ -35,10 +35,10 @@ preprocess_text:
35
  val_path: "filelists/val.list"
36
  # 配置文件路径
37
  config_path: "config.json"
38
- # 每个speaker的验证集条数
39
- val_per_spk: 4
40
  # 验证集最大条数,多于的会被截断并放到训练集中
41
- max_val_total: 8
42
  # 是否进行数据清洗
43
  clean: true
44
 
@@ -49,35 +49,51 @@ bert_gen:
49
  # 训练数据集配置文件路径
50
  config_path: "config.json"
51
  # 并行数
52
- num_processes: 2
53
  # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理
54
  # 该选项同时决定了get_bert_feature的默认设备
55
  device: "cuda"
56
  # 使用多卡推理
57
  use_multi_device: false
58
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
  # train 训练配置
61
  # 注意, “:” 后需要加空格
62
  train_ms:
63
- # 需要加载的环境变量,多显卡训练时RANK请手动在环境变量填写
64
- # 环境变量对应名称环境变量不存在时加载,也就是说手动添加的环境变量优先级更高,会覆盖本配置文件
65
  env:
66
  MASTER_ADDR: "localhost"
67
  MASTER_PORT: 10086
68
  WORLD_SIZE: 1
 
69
  RANK: 0
70
  # 可以填写任意名的环境变量
71
  # THE_ENV_VAR_YOU_NEED_TO_USE: "1234567"
72
  # 底模设置
73
  base:
74
- use_base_model: True
75
  repo_id: "Stardust_minus/Bert-VITS2"
76
- model_image: "Bert-VITS2中日英底模-fix" # openi网页的模型名
77
  # 训练模型存储目录:与旧版本的区别,原先数据集是存放在logs/model_name下的,现在改为统一存放在Data/你的数据集/models下
78
  model: "models"
79
  # 配置文件路径
80
- config_path: "configs/config.json"
 
 
 
 
 
 
81
 
82
 
83
  # webui webui配置
@@ -86,9 +102,9 @@ webui:
86
  # 推理设备
87
  device: "cuda"
88
  # 模型路径
89
- model: "genshin/models/G_8000.pth"
90
  # 配置文件路径
91
- config_path: "configs/config.json"
92
  # 端口号
93
  port: 7860
94
  # 是否公开部署,对外网开放
@@ -99,7 +115,7 @@ webui:
99
  language_identification_library: "langid"
100
 
101
 
102
- # server api配置
103
  # 注意, “:” 后需要加空格
104
  # 注意,本配置下的所有配置均为相对于根目录的路径
105
  server:
@@ -107,8 +123,10 @@ server:
107
  port: 5000
108
  # 模型默认使用设备:但是当前并没有实现这个配置。
109
  device: "cuda"
110
- # 需要加载的所有模型的配置
 
111
  # 注意,所有模型都必须正确配置model与config的路径,空路径会导致加载错误。
 
112
  models:
113
  - # 模型的路径
114
  model: ""
@@ -149,7 +167,6 @@ server:
149
  # 不必填写所有人物,不填的使用默认值
150
  speakers: [ ] # 也可以不填
151
 
152
-
153
  # 百度翻译开放平台 api配置
154
  # api接入文档 https://api.fanyi.baidu.com/doc/21
155
  # 请不要在github等网站公开分享你的app id 与 key
 
4
  # 拟提供通用路径配置,统一存放数据,避免数据放得很乱
5
  # 每个数据集与其对应的模型存放至统一路径下,后续所有的路径配置均为相对于datasetPath的路径
6
  # 不填或者填空则路径为相对于项目根目录的路径
7
+ dataset_path: "Data/"
8
 
9
  # 模型镜像源,默认huggingface,使用openi镜像源需指定openi_token
10
  mirror: ""
 
17
  sampling_rate: 44100
18
  # 音频文件输入路径,重采样会将该路径下所有.wav音频文件重采样
19
  # 请填入相对于datasetPath的相对路径
20
+ in_dir: "audios/raw" # 相对于根目录的路径为 /datasetPath/in_dir
21
  # 音频文件重采样后输出路径
22
+ out_dir: "audios/wavs"
23
 
24
 
25
  # preprocess_text 数据集预处理相关配置
26
  # 注意, “:” 后需要加空格
27
  preprocess_text:
28
  # 原始文本文件路径,文本格式应为{wav_path}|{speaker_name}|{language}|{text}。
29
+ transcription_path: "filelists/你的数据集文本.list"
30
  # 数据清洗后文本路径,可以不填。不填则将在原始文本目录生成
31
  cleaned_path: ""
32
  # 训练集路径
 
35
  val_path: "filelists/val.list"
36
  # 配置文件路径
37
  config_path: "config.json"
38
+ # 每个语言的验证集条数
39
+ val_per_lang: 4
40
  # 验证集最大条数,多于的会被截断并放到训练集中
41
+ max_val_total: 12
42
  # 是否进行数据清洗
43
  clean: true
44
 
 
49
  # 训练数据集配置文件路径
50
  config_path: "config.json"
51
  # 并行数
52
+ num_processes: 4
53
  # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理
54
  # 该选项同时决定了get_bert_feature的默认设备
55
  device: "cuda"
56
  # 使用多卡推理
57
  use_multi_device: false
58
 
59
+ # emo_gen 相关配置
60
+ # 注意, “:” 后需要加空格
61
+ emo_gen:
62
+ # 训练数据集配置文件路径
63
+ config_path: "config.json"
64
+ # 并行数
65
+ num_processes: 4
66
+ # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理
67
+ device: "cuda"
68
+ # 使用多卡推理
69
+ use_multi_device: false
70
 
71
  # train 训练配置
72
  # 注意, “:” 后需要加空格
73
  train_ms:
 
 
74
  env:
75
  MASTER_ADDR: "localhost"
76
  MASTER_PORT: 10086
77
  WORLD_SIZE: 1
78
+ LOCAL_RANK: 0
79
  RANK: 0
80
  # 可以填写任意名的环境变量
81
  # THE_ENV_VAR_YOU_NEED_TO_USE: "1234567"
82
  # 底模设置
83
  base:
84
+ use_base_model: false
85
  repo_id: "Stardust_minus/Bert-VITS2"
86
+ model_image: "Bert-VITS2_2.2-Clap底模" # openi网页的模型名
87
  # 训练模型存储目录:与旧版本的区别,原先数据集是存放在logs/model_name下的,现在改为统一存放在Data/你的数据集/models下
88
  model: "models"
89
  # 配置文件路径
90
+ config_path: "config.json"
91
+ # 训练使用的worker,不建议超过CPU核心数
92
+ num_workers: 16
93
+ # 关闭此项可以节约接近50%的磁盘空间,但是可能导致实际训练速度变慢和更高的CPU使用率。
94
+ spec_cache: True
95
+ # 保存的检查点数量,多于此数目的权重会被删除来节省空间。
96
+ keep_ckpts: 8
97
 
98
 
99
  # webui webui配置
 
102
  # 推理设备
103
  device: "cuda"
104
  # 模型路径
105
+ model: "models/G_8000.pth"
106
  # 配置文件路径
107
+ config_path: "config.json"
108
  # 端口号
109
  port: 7860
110
  # 是否公开部署,对外网开放
 
115
  language_identification_library: "langid"
116
 
117
 
118
+ # server-fastapi配置
119
  # 注意, “:” 后需要加空格
120
  # 注意,本配置下的所有配置均为相对于根目录的路径
121
  server:
 
123
  port: 5000
124
  # 模型默认使用设备:但是当前并没有实现这个配置。
125
  device: "cuda"
126
+ # 需要加载的所有模型的配置,可以填多个模型,也可以不填模型,等网页成功后手动加载模型
127
+ # 不加载模型的配置格式:删除默认给的两个模型配置,给models赋值 [ ],也就是空列表。参考模型2的speakers 即 models: [ ]
128
  # 注意,所有模型都必须正确配置model与config的路径,空路径会导致加载错误。
129
+ # 也可以不填模型,等网页加载成功后手动填写models。
130
  models:
131
  - # 模型的路径
132
  model: ""
 
167
  # 不必填写所有人物,不填的使用默认值
168
  speakers: [ ] # 也可以不填
169
 
 
170
  # 百度翻译开放平台 api配置
171
  # api接入文档 https://api.fanyi.baidu.com/doc/21
172
  # 请不要在github等网站公开分享你的app id 与 key
configs/config.json CHANGED
@@ -10,7 +10,7 @@
10
  0.99
11
  ],
12
  "eps": 1e-09,
13
- "batch_size": 16,
14
  "fp16_run": false,
15
  "lr_decay": 0.99995,
16
  "segment_size": 16384,
@@ -18,11 +18,14 @@
18
  "warmup_epochs": 0,
19
  "c_mel": 45,
20
  "c_kl": 1.0,
21
- "skip_optimizer": true
 
 
 
22
  },
23
  "data": {
24
- "training_files": "Data/BangDream/filelists/train.list",
25
- "validation_files": "Data/BangDream/filelists/val.list",
26
  "max_wav_value": 32768.0,
27
  "sampling_rate": 44100,
28
  "filter_length": 2048,
@@ -32,101 +35,864 @@
32
  "mel_fmin": 0.0,
33
  "mel_fmax": null,
34
  "add_blank": true,
35
- "n_speakers": 700,
36
  "cleaned_text": true,
37
  "spk2id": {
38
- "華戀": 0,
39
- "": 1,
40
- "": 2,
41
- "未知留": 3,
42
- "香子": 4,
43
- "雙葉": 5,
44
- "真晝": 6,
45
- "艾露": 7,
46
- "珠緒": 8,
47
- "艾露露": 9,
48
- "純那": 10,
49
- "克洛迪娜": 11,
50
- "真矢": 12,
51
- "奈奈": 13,
52
- "": 14,
53
- "": 15,
54
- "一愛": 16,
55
- "菈樂菲": 17,
56
- "": 18,
57
- "美空": 19,
58
- "靜羽": 20,
59
- "悠悠子": 21,
60
- "八千代": 22,
61
- "": 23,
62
- "美帆": 24,
63
- "芙蘿菈": 25,
64
- "克蕾兒": 26,
65
- "安德露": 27,
66
- "瑪莉亞貝菈": 28,
67
- "克拉迪亞": 29,
68
- "桃樂西": 30,
69
- "瑪麗安": 31,
70
- "三月七": 32,
71
- "香澄": 33,
72
- "有咲": 34,
73
- "沙綾": 35,
74
- "りみ": 36,
75
- "たえ": 37,
76
- "沙綾、りみ、たえ": 38,
77
- "": 39,
78
- "一同": 40,
79
- "まりな": 41,
80
- "ゆり": 42,
81
- "明日香": 43,
82
- "???": 44,
83
- "ひまり": 45,
84
- "モカ": 46,
85
- "つぐみ": 47,
86
- "": 48,
87
- "リサ": 49,
88
- "千聖": 50,
89
- "花音": 51,
90
- "イヴ": 52,
91
- "日菜": 53,
92
- "友希那": 54,
93
- "紗夜": 55,
94
- "こころ": 56,
95
- "美咲": 57,
96
- "": 58,
97
- "はぐみ": 59,
98
- "ミッシェル": 60,
99
- "マリー": 61,
100
- "怪盗ハロハッピー": 62,
101
- "ニコリーナ": 63,
102
- "": 64,
103
- "麻弥": 65,
104
- "燐子": 66,
105
- "あこ": 67,
106
- "ゆきな": 68,
107
- "ましろ": 69,
108
- "つくし": 70,
109
- "透子": 71,
110
- "七深": 72,
111
- "瑠唯": 73,
112
- "六花": 74,
113
- "パレオ": 75,
114
- "レイヤ": 76,
115
- "マスキング": 77,
116
- "チュチュ": 78,
117
- "ますき": 79,
118
- "ロック": 80,
119
- "令王那": 81,
120
- "CHIYU": 82,
121
- "レイ": 83,
122
- "": 84,
123
- "そよ": 85,
124
- "祥子": 86,
125
- "立希": 87,
126
- "": 88,
127
- "愛音": 89,
128
- "楽奈": 90,
129
- "海鈴": 91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  }
131
  },
132
  "model": {
@@ -183,5 +949,5 @@
183
  "use_spectral_norm": false,
184
  "gin_channels": 256
185
  },
186
- "version": "2.0"
187
- }
 
10
  0.99
11
  ],
12
  "eps": 1e-09,
13
+ "batch_size": 12,
14
  "fp16_run": false,
15
  "lr_decay": 0.99995,
16
  "segment_size": 16384,
 
18
  "warmup_epochs": 0,
19
  "c_mel": 45,
20
  "c_kl": 1.0,
21
+ "skip_optimizer": true,
22
+ "freeze_ZH_bert": false,
23
+ "freeze_JP_bert": false,
24
+ "freeze_EN_bert": false
25
  },
26
  "data": {
27
+ "training_files": "filelists/train.list",
28
+ "validation_files": "filelists/val.list",
29
  "max_wav_value": 32768.0,
30
  "sampling_rate": 44100,
31
  "filter_length": 2048,
 
35
  "mel_fmin": 0.0,
36
  "mel_fmax": null,
37
  "add_blank": true,
38
+ "n_speakers": 896,
39
  "cleaned_text": true,
40
  "spk2id": {
41
+ "派蒙_ZH": 0,
42
+ "纳西妲_ZH": 1,
43
+ "凯亚_ZH": 2,
44
+ "阿贝多_ZH": 3,
45
+ "温迪_ZH": 4,
46
+ "枫原万叶_ZH": 5,
47
+ "钟离_ZH": 6,
48
+ "荒泷一斗_ZH": 7,
49
+ "八重神子_ZH": 8,
50
+ "艾尔海森_ZH": 9,
51
+ "提纳里_ZH": 10,
52
+ "迪希雅_ZH": 11,
53
+ "卡维_ZH": 12,
54
+ "宵宫_ZH": 13,
55
+ "那维莱特_ZH": 14,
56
+ "莱依拉_ZH": 15,
57
+ "赛诺_ZH": 16,
58
+ "莫娜_ZH": 17,
59
+ "诺艾尔_ZH": 18,
60
+ "托马_ZH": 19,
61
+ "凝光_ZH": 20,
62
+ "林尼_ZH": 21,
63
+ "北斗_ZH": 22,
64
+ "柯莱_ZH": 23,
65
+ "神里绫华_ZH": 24,
66
+ "可莉_ZH": 25,
67
+ "芭芭拉_ZH": 26,
68
+ "雷电将军_ZH": 27,
69
+ "娜维娅_ZH": 28,
70
+ "芙宁娜_ZH": 29,
71
+ "珊瑚宫心海_ZH": 30,
72
+ "鹿野院平藏_ZH": 31,
73
+ "迪奥娜_ZH": 32,
74
+ "琴_ZH": 33,
75
+ "五郎_ZH": 34,
76
+ "班尼特_ZH": 35,
77
+ "达达利亚_ZH": 36,
78
+ "安柏_ZH": 37,
79
+ "莱欧斯利_ZH": 38,
80
+ "夜兰_ZH": 39,
81
+ "妮露_ZH": 40,
82
+ "辛焱_ZH": 41,
83
+ "丽莎_ZH": 42,
84
+ "珐露珊_ZH": 43,
85
+ "魈_ZH": 44,
86
+ "香菱_ZH": 45,
87
+ "迪卢克_ZH": 46,
88
+ "砂糖_ZH": 47,
89
+ "烟绯_ZH": 48,
90
+ "早柚_ZH": 49,
91
+ "云堇_ZH": 50,
92
+ "刻晴_ZH": 51,
93
+ "重云_ZH": 52,
94
+ "优菈_ZH": 53,
95
+ "胡桃_ZH": 54,
96
+ "流浪者_ZH": 55,
97
+ "久岐忍_ZH": 56,
98
+ "神里绫人_ZH": 57,
99
+ "甘雨_ZH": 58,
100
+ "戴因斯雷布_ZH": 59,
101
+ "菲谢尔_ZH": 60,
102
+ "白术_ZH": 61,
103
+ "行秋_ZH": 62,
104
+ "九条裟罗_ZH": 63,
105
+ "夏洛蒂_ZH": 64,
106
+ "雷泽_ZH": 65,
107
+ "申鹤_ZH": 66,
108
+ "荧_ZH": 67,
109
+ "空_ZH": 68,
110
+ "迪娜泽黛_ZH": 69,
111
+ "凯瑟琳_ZH": 70,
112
+ "多莉_ZH": 71,
113
+ "坎蒂丝_ZH": 72,
114
+ "琳妮特_ZH": 73,
115
+ "萍姥姥_ZH": 74,
116
+ "罗莎莉亚_ZH": 75,
117
+ "埃德_ZH": 76,
118
+ "爱贝尔_ZH": 77,
119
+ "伊迪娅_ZH": 78,
120
+ "留云借风真君_ZH": 79,
121
+ "绮良良_ZH": 80,
122
+ "七七_ZH": 81,
123
+ "式大将_ZH": 82,
124
+ "瑶瑶_ZH": 83,
125
+ "奥兹_ZH": 84,
126
+ "菲米尼_ZH": 85,
127
+ "米卡_ZH": 86,
128
+ "哲平_ZH": 87,
129
+ "大肉丸_ZH": 88,
130
+ "托克_ZH": 89,
131
+ "蒂玛乌斯_ZH": 90,
132
+ "昆钧_ZH": 91,
133
+ "欧菲妮_ZH": 92,
134
+ "塞琉斯_ZH": 93,
135
+ "仆人_ZH": 94,
136
+ "迈勒斯_ZH": 95,
137
+ "希格雯_ZH": 96,
138
+ "阿守_ZH": 97,
139
+ "拉赫曼_ZH": 98,
140
+ "杜拉夫_ZH": 99,
141
+ "伊利亚斯_ZH": 100,
142
+ "阿晃_ZH": 101,
143
+ "旁白_ZH": 102,
144
+ "爱德琳_ZH": 103,
145
+ "埃洛伊_ZH": 104,
146
+ "德沃沙克_ZH": 105,
147
+ "玛乔丽_ZH": 106,
148
+ "塞塔蕾_ZH": 107,
149
+ "柊千里_ZH": 108,
150
+ "海芭夏_ZH": 109,
151
+ "九条镰治_ZH": 110,
152
+ "阿娜耶_ZH": 111,
153
+ "笼钓瓶一心_ZH": 112,
154
+ "回声海螺_ZH": 113,
155
+ "劳维克_ZH": 114,
156
+ "元太_ZH": 115,
157
+ "阿扎尔_ZH": 116,
158
+ "查尔斯_ZH": 117,
159
+ "阿洛瓦_ZH": 118,
160
+ "埃勒曼_ZH": 119,
161
+ "纳比尔_ZH": 120,
162
+ "莎拉_ZH": 121,
163
+ "康纳_ZH": 122,
164
+ "博来_ZH": 123,
165
+ "玛塞勒_ZH": 124,
166
+ "阿祇_ZH": 125,
167
+ "博士_ZH": 126,
168
+ "玛格丽特_ZH": 127,
169
+ "迪尔菲_ZH": 128,
170
+ "宛烟_ZH": 129,
171
+ "羽生田千鹤_ZH": 130,
172
+ "海妮耶_ZH": 131,
173
+ "旅行者_ZH": 132,
174
+ "霍夫曼_ZH": 133,
175
+ "佐西摩斯_ZH": 134,
176
+ "鹿野奈奈_ZH": 135,
177
+ "舒伯特_ZH": 136,
178
+ "天叔_ZH": 137,
179
+ "艾莉丝_ZH": 138,
180
+ "龙二_ZH": 139,
181
+ "莺儿_ZH": 140,
182
+ "嘉良_ZH": 141,
183
+ "一心传名刀_ZH": 142,
184
+ "费迪南德_ZH": 143,
185
+ "珊瑚_ZH": 144,
186
+ "言笑_ZH": 145,
187
+ "久利须_ZH": 146,
188
+ "嘉玛_ZH": 147,
189
+ "艾文_ZH": 148,
190
+ "克洛琳德_ZH": 149,
191
+ "丹吉尔_ZH": 150,
192
+ "女士_ZH": 151,
193
+ "白老先生_ZH": 152,
194
+ "天目十五_ZH": 153,
195
+ "老孟_ZH": 154,
196
+ "巴达维_ZH": 155,
197
+ "长生_ZH": 156,
198
+ "吴船长_ZH": 157,
199
+ "拉齐_ZH": 158,
200
+ "艾伯特_ZH": 159,
201
+ "松浦_ZH": 160,
202
+ "埃泽_ZH": 161,
203
+ "阿圆_ZH": 162,
204
+ "莫塞伊思_ZH": 163,
205
+ "阿拉夫_ZH": 164,
206
+ "杜吉耶_ZH": 165,
207
+ "石头_ZH": 166,
208
+ "百闻_ZH": 167,
209
+ "波洛_ZH": 168,
210
+ "斯坦利_ZH": 169,
211
+ "博易_ZH": 170,
212
+ "迈蒙_ZH": 171,
213
+ "掇星攫辰天君_ZH": 172,
214
+ "毗伽尔_ZH": 173,
215
+ "芙卡洛斯_ZH": 174,
216
+ "恶龙_ZH": 175,
217
+ "恕筠_ZH": 176,
218
+ "知易_ZH": 177,
219
+ "克列门特_ZH": 178,
220
+ "大慈树王_ZH": 179,
221
+ "西拉杰_ZH": 180,
222
+ "上杉_ZH": 181,
223
+ "阿尔卡米_ZH": 182,
224
+ "纯水精灵_ZH": 183,
225
+ "常九爷_ZH": 184,
226
+ "沙扎曼_ZH": 185,
227
+ "田铁嘴_ZH": 186,
228
+ "克罗索_ZH": 187,
229
+ "阿巴图伊_ZH": 188,
230
+ "悦_ZH": 189,
231
+ "阿佩普_ZH": 190,
232
+ "埃尔欣根_ZH": 191,
233
+ "萨赫哈蒂_ZH": 192,
234
+ "塔杰·拉德卡尼_ZH": 193,
235
+ "安西_ZH": 194,
236
+ "埃舍尔_ZH": 195,
237
+ "萨齐因_ZH": 196,
238
+ "派蒙_JP": 197,
239
+ "纳西妲_JP": 198,
240
+ "凯亚_JP": 199,
241
+ "阿贝多_JP": 200,
242
+ "温迪_JP": 201,
243
+ "枫原万叶_JP": 202,
244
+ "钟离_JP": 203,
245
+ "荒泷一斗_JP": 204,
246
+ "八重神子_JP": 205,
247
+ "艾尔海森_JP": 206,
248
+ "提纳里_JP": 207,
249
+ "迪希雅_JP": 208,
250
+ "卡维_JP": 209,
251
+ "宵宫_JP": 210,
252
+ "那维莱特_JP": 211,
253
+ "莱依拉_JP": 212,
254
+ "赛诺_JP": 213,
255
+ "莫娜_JP": 214,
256
+ "诺艾尔_JP": 215,
257
+ "托马_JP": 216,
258
+ "凝光_JP": 217,
259
+ "林尼_JP": 218,
260
+ "北斗_JP": 219,
261
+ "柯莱_JP": 220,
262
+ "神里绫华_JP": 221,
263
+ "可莉_JP": 222,
264
+ "芭芭拉_JP": 223,
265
+ "雷电将军_JP": 224,
266
+ "娜维娅_JP": 225,
267
+ "芙宁娜_JP": 226,
268
+ "珊瑚宫心海_JP": 227,
269
+ "鹿野院平藏_JP": 228,
270
+ "迪奥娜_JP": 229,
271
+ "琴_JP": 230,
272
+ "五郎_JP": 231,
273
+ "班尼特_JP": 232,
274
+ "达达利亚_JP": 233,
275
+ "安柏_JP": 234,
276
+ "莱欧斯利_JP": 235,
277
+ "夜兰_JP": 236,
278
+ "妮露_JP": 237,
279
+ "辛焱_JP": 238,
280
+ "丽莎_JP": 239,
281
+ "珐露珊_JP": 240,
282
+ "魈_JP": 241,
283
+ "香菱_JP": 242,
284
+ "迪卢克_JP": 243,
285
+ "砂糖_JP": 244,
286
+ "烟绯_JP": 245,
287
+ "早柚_JP": 246,
288
+ "云堇_JP": 247,
289
+ "刻晴_JP": 248,
290
+ "重云_JP": 249,
291
+ "优菈_JP": 250,
292
+ "胡桃_JP": 251,
293
+ "流浪者_JP": 252,
294
+ "久岐忍_JP": 253,
295
+ "神里绫人_JP": 254,
296
+ "甘雨_JP": 255,
297
+ "戴因斯雷布_JP": 256,
298
+ "菲谢尔_JP": 257,
299
+ "白术_JP": 258,
300
+ "行秋_JP": 259,
301
+ "九条裟罗_JP": 260,
302
+ "夏洛蒂_JP": 261,
303
+ "雷泽_JP": 262,
304
+ "申鹤_JP": 263,
305
+ "空_JP": 264,
306
+ "荧_JP": 265,
307
+ "迪娜泽黛_JP": 266,
308
+ "凯瑟琳_JP": 267,
309
+ "多莉_JP": 268,
310
+ "坎蒂丝_JP": 269,
311
+ "琳妮特_JP": 270,
312
+ "萍姥姥_JP": 271,
313
+ "罗莎莉亚_JP": 272,
314
+ "埃德_JP": 273,
315
+ "爱贝尔_JP": 274,
316
+ "伊迪娅_JP": 275,
317
+ "留云借风真君_JP": 276,
318
+ "绮良良_JP": 277,
319
+ "七七_JP": 278,
320
+ "式大将_JP": 279,
321
+ "瑶瑶_JP": 280,
322
+ "奥兹_JP": 281,
323
+ "菲米尼_JP": 282,
324
+ "米卡_JP": 283,
325
+ "哲平_JP": 284,
326
+ "大肉丸_JP": 285,
327
+ "托克_JP": 286,
328
+ "蒂玛乌斯_JP": 287,
329
+ "昆钧_JP": 288,
330
+ "欧菲妮_JP": 289,
331
+ "塞琉斯_JP": 290,
332
+ "仆人_JP": 291,
333
+ "迈勒斯_JP": 292,
334
+ "希格雯_JP": 293,
335
+ "阿守_JP": 294,
336
+ "拉赫曼_JP": 295,
337
+ "杜拉夫_JP": 296,
338
+ "伊利亚斯_JP": 297,
339
+ "阿晃_JP": 298,
340
+ "旁白_JP": 299,
341
+ "爱德琳_JP": 300,
342
+ "埃洛伊_JP": 301,
343
+ "德沃沙克_JP": 302,
344
+ "玛乔丽_JP": 303,
345
+ "塞塔蕾_JP": 304,
346
+ "柊千里_JP": 305,
347
+ "海芭夏_JP": 306,
348
+ "九条镰治_JP": 307,
349
+ "阿娜耶_JP": 308,
350
+ "笼钓瓶一心_JP": 309,
351
+ "回声海螺_JP": 310,
352
+ "劳维克_JP": 311,
353
+ "元太_JP": 312,
354
+ "阿扎尔_JP": 313,
355
+ "查尔斯_JP": 314,
356
+ "阿洛瓦_JP": 315,
357
+ "埃勒曼_JP": 316,
358
+ "纳比尔_JP": 317,
359
+ "莎拉_JP": 318,
360
+ "康纳_JP": 319,
361
+ "博来_JP": 320,
362
+ "玛塞勒_JP": 321,
363
+ "阿祇_JP": 322,
364
+ "博士_JP": 323,
365
+ "迪尔菲_JP": 324,
366
+ "玛格丽特_JP": 325,
367
+ "宛烟_JP": 326,
368
+ "羽生田千鹤_JP": 327,
369
+ "海妮耶_JP": 328,
370
+ "霍夫曼_JP": 329,
371
+ "旅行者_JP": 330,
372
+ "佐西摩斯_JP": 331,
373
+ "舒伯特_JP": 332,
374
+ "鹿野奈奈_JP": 333,
375
+ "天叔_JP": 334,
376
+ "龙二_JP": 335,
377
+ "艾莉丝_JP": 336,
378
+ "莺儿_JP": 337,
379
+ "嘉良_JP": 338,
380
+ "珊瑚_JP": 339,
381
+ "言笑_JP": 340,
382
+ "一心传名刀_JP": 341,
383
+ "费迪南德_JP": 342,
384
+ "久利须_JP": 343,
385
+ "嘉玛_JP": 344,
386
+ "艾文_JP": 345,
387
+ "克洛琳德_JP": 346,
388
+ "丹吉尔_JP": 347,
389
+ "天目十五_JP": 348,
390
+ "女士_JP": 349,
391
+ "老孟_JP": 350,
392
+ "白老先生_JP": 351,
393
+ "舍利夫_JP": 352,
394
+ "巴达维_JP": 353,
395
+ "拉齐_JP": 354,
396
+ "长生_JP": 355,
397
+ "吴船长_JP": 356,
398
+ "艾伯特_JP": 357,
399
+ "松浦_JP": 358,
400
+ "埃泽_JP": 359,
401
+ "阿圆_JP": 360,
402
+ "阿拉夫_JP": 361,
403
+ "莫塞伊思_JP": 362,
404
+ "石头_JP": 363,
405
+ "百闻_JP": 364,
406
+ "杜吉耶_JP": 365,
407
+ "波洛_JP": 366,
408
+ "掇星攫辰天君_JP": 367,
409
+ "迈蒙_JP": 368,
410
+ "博易_JP": 369,
411
+ "诗筠_JP": 370,
412
+ "斯坦利_JP": 371,
413
+ "毗伽尔_JP": 372,
414
+ "芙卡洛斯_JP": 373,
415
+ "恶龙_JP": 374,
416
+ "小仓澪_JP": 375,
417
+ "恕筠_JP": 376,
418
+ "知易_JP": 377,
419
+ "克列门特_JP": 378,
420
+ "大慈树王_JP": 379,
421
+ "望雅_JP": 380,
422
+ "黑田_JP": 381,
423
+ "卡莉娜_JP": 382,
424
+ "马姆杜_JP": 383,
425
+ "科林斯_JP": 384,
426
+ "上杉_JP": 385,
427
+ "西拉杰_JP": 386,
428
+ "菲尔戈黛特_JP": 387,
429
+ "一平_JP": 388,
430
+ "纯水精灵_JP": 389,
431
+ "阿尔卡米_JP": 390,
432
+ "老戴_JP": 391,
433
+ "谢赫祖拜尔_JP": 392,
434
+ "沙扎曼_JP": 393,
435
+ "田铁嘴_JP": 394,
436
+ "小野寺_JP": 395,
437
+ "百识_JP": 396,
438
+ "克罗索_JP": 397,
439
+ "莱斯格_JP": 398,
440
+ "芷巧_JP": 399,
441
+ "加藤洋平_JP": 400,
442
+ "阿巴图伊_JP": 401,
443
+ "埃尔欣根_JP": 402,
444
+ "斯嘉莉_JP": 403,
445
+ "阿佩普_JP": 404,
446
+ "巫女_JP": 405,
447
+ "卡布斯_JP": 406,
448
+ "洛伦佐_JP": 407,
449
+ "萨赫哈蒂_JP": 408,
450
+ "娜德瓦_JP": 409,
451
+ "塞德娜_JP": 410,
452
+ "塔杰·拉德卡尼_JP": 411,
453
+ "绘星_JP": 412,
454
+ "泽田_JP": 413,
455
+ "安西_JP": 414,
456
+ "拉伊德_JP": 415,
457
+ "亚卡巴_JP": 416,
458
+ "有乐斋_JP": 417,
459
+ "莱昂_JP": 418,
460
+ "尤苏波夫_JP": 419,
461
+ "夏妮_JP": 420,
462
+ "埃舍尔_JP": 421,
463
+ "萨齐因_JP": 422,
464
+ "古山_JP": 423,
465
+ "自称渊上之物_JP": 424,
466
+ "丹羽_JP": 425,
467
+ "塞萨尔的日记_JP": 426,
468
+ "派蒙_EN": 427,
469
+ "纳西妲_EN": 428,
470
+ "凯亚_EN": 429,
471
+ "阿贝多_EN": 430,
472
+ "温迪_EN": 431,
473
+ "枫原万叶_EN": 432,
474
+ "钟离_EN": 433,
475
+ "荒泷一斗_EN": 434,
476
+ "八重神子_EN": 435,
477
+ "艾尔海森_EN": 436,
478
+ "提纳里_EN": 437,
479
+ "迪希雅_EN": 438,
480
+ "卡维_EN": 439,
481
+ "宵宫_EN": 440,
482
+ "莱依拉_EN": 441,
483
+ "那维莱特_EN": 442,
484
+ "赛诺_EN": 443,
485
+ "莫娜_EN": 444,
486
+ "诺艾尔_EN": 445,
487
+ "托马_EN": 446,
488
+ "凝光_EN": 447,
489
+ "林尼_EN": 448,
490
+ "北斗_EN": 449,
491
+ "柯莱_EN": 450,
492
+ "神里绫华_EN": 451,
493
+ "可莉_EN": 452,
494
+ "芭芭拉_EN": 453,
495
+ "雷电将军_EN": 454,
496
+ "娜维娅_EN": 455,
497
+ "芙宁娜_EN": 456,
498
+ "珊瑚宫心海_EN": 457,
499
+ "鹿野院平藏_EN": 458,
500
+ "迪奥娜_EN": 459,
501
+ "五郎_EN": 460,
502
+ "琴_EN": 461,
503
+ "班尼特_EN": 462,
504
+ "达达利亚_EN": 463,
505
+ "安柏_EN": 464,
506
+ "莱欧斯利_EN": 465,
507
+ "夜兰_EN": 466,
508
+ "妮露_EN": 467,
509
+ "辛焱_EN": 468,
510
+ "珐露珊_EN": 469,
511
+ "丽莎_EN": 470,
512
+ "魈_EN": 471,
513
+ "香菱_EN": 472,
514
+ "迪卢克_EN": 473,
515
+ "砂糖_EN": 474,
516
+ "烟绯_EN": 475,
517
+ "早柚_EN": 476,
518
+ "云堇_EN": 477,
519
+ "刻晴_EN": 478,
520
+ "重云_EN": 479,
521
+ "优菈_EN": 480,
522
+ "胡桃_EN": 481,
523
+ "流浪者_EN": 482,
524
+ "久岐忍_EN": 483,
525
+ "神里绫人_EN": 484,
526
+ "甘雨_EN": 485,
527
+ "戴因斯雷布_EN": 486,
528
+ "菲谢尔_EN": 487,
529
+ "白术_EN": 488,
530
+ "行秋_EN": 489,
531
+ "九条裟罗_EN": 490,
532
+ "夏洛蒂_EN": 491,
533
+ "雷泽_EN": 492,
534
+ "申鹤_EN": 493,
535
+ "荧_EN": 494,
536
+ "空_EN": 495,
537
+ "迪娜泽黛_EN": 496,
538
+ "凯瑟琳_EN": 497,
539
+ "多莉_EN": 498,
540
+ "坎蒂丝_EN": 499,
541
+ "琳妮特_EN": 500,
542
+ "萍姥姥_EN": 501,
543
+ "罗莎莉亚_EN": 502,
544
+ "埃德_EN": 503,
545
+ "爱贝尔_EN": 504,
546
+ "伊迪娅_EN": 505,
547
+ "留云借风真君_EN": 506,
548
+ "绮良良_EN": 507,
549
+ "七七_EN": 508,
550
+ "式大将_EN": 509,
551
+ "瑶瑶_EN": 510,
552
+ "奥兹_EN": 511,
553
+ "菲米尼_EN": 512,
554
+ "米卡_EN": 513,
555
+ "哲平_EN": 514,
556
+ "大肉丸_EN": 515,
557
+ "托克_EN": 516,
558
+ "蒂玛乌斯_EN": 517,
559
+ "昆钧_EN": 518,
560
+ "欧菲妮_EN": 519,
561
+ "塞琉斯_EN": 520,
562
+ "仆人_EN": 521,
563
+ "迈勒斯_EN": 522,
564
+ "希格雯_EN": 523,
565
+ "阿守_EN": 524,
566
+ "拉赫曼_EN": 525,
567
+ "杜拉夫_EN": 526,
568
+ "伊利亚斯_EN": 527,
569
+ "阿晃_EN": 528,
570
+ "旁白_EN": 529,
571
+ "爱德琳_EN": 530,
572
+ "埃洛伊_EN": 531,
573
+ "德沃沙克_EN": 532,
574
+ "玛乔丽_EN": 533,
575
+ "塞塔蕾_EN": 534,
576
+ "柊千里_EN": 535,
577
+ "海芭夏_EN": 536,
578
+ "九条镰治_EN": 537,
579
+ "阿娜耶_EN": 538,
580
+ "笼钓瓶一心_EN": 539,
581
+ "回声海螺_EN": 540,
582
+ "劳维克_EN": 541,
583
+ "元太_EN": 542,
584
+ "阿扎尔_EN": 543,
585
+ "查尔斯_EN": 544,
586
+ "阿洛瓦_EN": 545,
587
+ "埃勒曼_EN": 546,
588
+ "纳比尔_EN": 547,
589
+ "莎拉_EN": 548,
590
+ "康纳_EN": 549,
591
+ "博来_EN": 550,
592
+ "玛塞勒_EN": 551,
593
+ "阿祇_EN": 552,
594
+ "博士_EN": 553,
595
+ "迪尔菲_EN": 554,
596
+ "宛烟_EN": 555,
597
+ "玛格丽特_EN": 556,
598
+ "羽生田千鹤_EN": 557,
599
+ "海妮耶_EN": 558,
600
+ "霍夫曼_EN": 559,
601
+ "旅行者_EN": 560,
602
+ "佐西摩斯_EN": 561,
603
+ "鹿野奈奈_EN": 562,
604
+ "舒伯特_EN": 563,
605
+ "天叔_EN": 564,
606
+ "艾莉丝_EN": 565,
607
+ "龙二_EN": 566,
608
+ "莺儿_EN": 567,
609
+ "嘉良_EN": 568,
610
+ "珊瑚_EN": 569,
611
+ "费迪南德_EN": 570,
612
+ "言笑_EN": 571,
613
+ "一心传名刀_EN": 572,
614
+ "久利须_EN": 573,
615
+ "嘉玛_EN": 574,
616
+ "艾文_EN": 575,
617
+ "克洛琳德_EN": 576,
618
+ "丹吉尔_EN": 577,
619
+ "女士_EN": 578,
620
+ "天目十五_EN": 579,
621
+ "老孟_EN": 580,
622
+ "白老先生_EN": 581,
623
+ "舍利夫_EN": 582,
624
+ "巴达维_EN": 583,
625
+ "拉齐_EN": 584,
626
+ "长生_EN": 585,
627
+ "吴船长_EN": 586,
628
+ "艾伯特_EN": 587,
629
+ "松浦_EN": 588,
630
+ "埃泽_EN": 589,
631
+ "阿圆_EN": 590,
632
+ "阿拉夫_EN": 591,
633
+ "莫塞伊思_EN": 592,
634
+ "石头_EN": 593,
635
+ "百闻_EN": 594,
636
+ "杜吉耶_EN": 595,
637
+ "波洛_EN": 596,
638
+ "斯坦利_EN": 597,
639
+ "掇星攫辰天君_EN": 598,
640
+ "迈蒙_EN": 599,
641
+ "博易_EN": 600,
642
+ "诗筠_EN": 601,
643
+ "毗伽尔_EN": 602,
644
+ "慧心_EN": 603,
645
+ "芙卡洛斯_EN": 604,
646
+ "恶龙_EN": 605,
647
+ "小仓澪_EN": 606,
648
+ "恕筠_EN": 607,
649
+ "知易_EN": 608,
650
+ "克列门特_EN": 609,
651
+ "大慈树王_EN": 610,
652
+ "维多利亚_EN": 611,
653
+ "黑田_EN": 612,
654
+ "马姆杜_EN": 613,
655
+ "科林斯_EN": 614,
656
+ "上杉_EN": 615,
657
+ "西拉杰_EN": 616,
658
+ "宁禄_EN": 617,
659
+ "纯水精灵_EN": 618,
660
+ "常九爷_EN": 619,
661
+ "阿尔卡米_EN": 620,
662
+ "沙扎曼_EN": 621,
663
+ "田铁嘴_EN": 622,
664
+ "加萨尼_EN": 623,
665
+ "克罗索_EN": 624,
666
+ "星稀_EN": 625,
667
+ "莱斯格_EN": 626,
668
+ "阿巴图伊_EN": 627,
669
+ "悦_EN": 628,
670
+ "德田_EN": 629,
671
+ "埃尔欣根_EN": 630,
672
+ "阿佩普_EN": 631,
673
+ "萨赫哈蒂_EN": 632,
674
+ "洛伦佐_EN": 633,
675
+ "塔杰·拉德卡尼_EN": 634,
676
+ "泽田_EN": 635,
677
+ "安西_EN": 636,
678
+ "理水叠山真君_EN": 637,
679
+ "埃舍尔_EN": 638,
680
+ "萨齐因_EN": 639,
681
+ "古田_EN": 640,
682
+ "三月七_ZH": 641,
683
+ "丹恒_ZH": 642,
684
+ "希儿_ZH": 643,
685
+ "娜塔莎_ZH": 644,
686
+ "希露瓦_ZH": 645,
687
+ "瓦尔特_ZH": 646,
688
+ "佩拉_ZH": 647,
689
+ "布洛妮娅_ZH": 648,
690
+ "虎克_ZH": 649,
691
+ "素裳_ZH": 650,
692
+ "克拉拉_ZH": 651,
693
+ "符玄_ZH": 652,
694
+ "白露_ZH": 653,
695
+ "杰帕德_ZH": 654,
696
+ "景元_ZH": 655,
697
+ "藿藿_ZH": 656,
698
+ "姬子_ZH": 657,
699
+ "穹_ZH": 658,
700
+ "星_ZH": 659,
701
+ "卡芙卡_ZH": 660,
702
+ "桂乃芬_ZH": 661,
703
+ "艾丝妲_ZH": 662,
704
+ "玲可_ZH": 663,
705
+ "彦卿_ZH": 664,
706
+ "托帕_ZH": 665,
707
+ "驭空_ZH": 666,
708
+ "浮烟_ZH": 667,
709
+ "停云_ZH": 668,
710
+ "镜流_ZH": 669,
711
+ "罗刹_ZH": 670,
712
+ "卢卡_ZH": 671,
713
+ "史瓦罗_ZH": 672,
714
+ "黑塔_ZH": 673,
715
+ "桑博_ZH": 674,
716
+ "伦纳德_ZH": 675,
717
+ "明曦_ZH": 676,
718
+ "银狼_ZH": 677,
719
+ "帕姆_ZH": 678,
720
+ "青雀_ZH": 679,
721
+ "乔瓦尼_ZH": 680,
722
+ "公输师傅_ZH": 681,
723
+ "晴霓_ZH": 682,
724
+ "螺丝咕姆_ZH": 683,
725
+ "阿兰_ZH": 684,
726
+ "奥列格_ZH": 685,
727
+ "丹枢_ZH": 686,
728
+ "尾巴_ZH": 687,
729
+ "寒鸦_ZH": 688,
730
+ "雪衣_ZH": 689,
731
+ "可可利亚_ZH": 690,
732
+ "青镞_ZH": 691,
733
+ "半夏_ZH": 692,
734
+ "银枝_ZH": 693,
735
+ "大毫_ZH": 694,
736
+ "霄翰_ZH": 695,
737
+ "信使_ZH": 696,
738
+ "费斯曼_ZH": 697,
739
+ "绿芙蓉_ZH": 698,
740
+ "dev_成男_ZH": 699,
741
+ "金人会长_ZH": 700,
742
+ "维利特_ZH": 701,
743
+ "维尔德_ZH": 702,
744
+ "斯科特_ZH": 703,
745
+ "卡波特_ZH": 704,
746
+ "刃_ZH": 705,
747
+ "岩明_ZH": 706,
748
+ "浣溪_ZH": 707,
749
+ "三月七_JP": 708,
750
+ "丹恒_JP": 709,
751
+ "希儿_JP": 710,
752
+ "娜塔莎_JP": 711,
753
+ "希露瓦_JP": 712,
754
+ "瓦尔特_JP": 713,
755
+ "佩拉_JP": 714,
756
+ "布洛妮娅_JP": 715,
757
+ "虎克_JP": 716,
758
+ "素裳_JP": 717,
759
+ "克拉拉_JP": 718,
760
+ "符玄_JP": 719,
761
+ "白露_JP": 720,
762
+ "杰帕德_JP": 721,
763
+ "景元_JP": 722,
764
+ "藿藿_JP": 723,
765
+ "姬子_JP": 724,
766
+ "卡芙卡_JP": 725,
767
+ "穹_JP": 726,
768
+ "星_JP": 727,
769
+ "桂乃芬_JP": 728,
770
+ "艾丝妲_JP": 729,
771
+ "彦卿_JP": 730,
772
+ "玲可_JP": 731,
773
+ "托帕_JP": 732,
774
+ "驭空_JP": 733,
775
+ "浮烟_JP": 734,
776
+ "停云_JP": 735,
777
+ "镜流_JP": 736,
778
+ "罗刹_JP": 737,
779
+ "卢卡_JP": 738,
780
+ "史瓦罗_JP": 739,
781
+ "黑塔_JP": 740,
782
+ "桑博_JP": 741,
783
+ "伦纳德_JP": 742,
784
+ "明曦_JP": 743,
785
+ "银狼_JP": 744,
786
+ "帕姆_JP": 745,
787
+ "青雀_JP": 746,
788
+ "乔瓦尼_JP": 747,
789
+ "公输师傅_JP": 748,
790
+ "晴霓_JP": 749,
791
+ "螺丝咕姆_JP": 750,
792
+ "阿兰_JP": 751,
793
+ "奥列格_JP": 752,
794
+ "丹枢_JP": 753,
795
+ "尾巴_JP": 754,
796
+ "寒鸦_JP": 755,
797
+ "雪衣_JP": 756,
798
+ "可可利亚_JP": 757,
799
+ "青镞_JP": 758,
800
+ "半夏_JP": 759,
801
+ "银枝_JP": 760,
802
+ "大毫_JP": 761,
803
+ "霄翰_JP": 762,
804
+ "信使_JP": 763,
805
+ "费斯曼_JP": 764,
806
+ "绿芙蓉_JP": 765,
807
+ "dev_成男_JP": 766,
808
+ "金人会长_JP": 767,
809
+ "维利特_JP": 768,
810
+ "维尔德_JP": 769,
811
+ "斯科特_JP": 770,
812
+ "刃_JP": 771,
813
+ "卡波特_JP": 772,
814
+ "岩明_JP": 773,
815
+ "浣溪_JP": 774,
816
+ "净砚_JP": 775,
817
+ "紫月季_JP": 776,
818
+ "歌蒂_JP": 777,
819
+ "奇怪的云骑_JP": 778,
820
+ "幻胧_JP": 779,
821
+ "斯薇塔_JP": 780,
822
+ "隐书_JP": 781,
823
+ "三月七_EN": 782,
824
+ "丹恒_EN": 783,
825
+ "希儿_EN": 784,
826
+ "娜塔莎_EN": 785,
827
+ "希露瓦_EN": 786,
828
+ "瓦尔特_EN": 787,
829
+ "佩拉_EN": 788,
830
+ "布洛妮娅_EN": 789,
831
+ "虎克_EN": 790,
832
+ "素裳_EN": 791,
833
+ "克拉拉_EN": 792,
834
+ "符玄_EN": 793,
835
+ "白露_EN": 794,
836
+ "杰帕德_EN": 795,
837
+ "景元_EN": 796,
838
+ "藿藿_EN": 797,
839
+ "姬子_EN": 798,
840
+ "卡芙卡_EN": 799,
841
+ "穹_EN": 800,
842
+ "星_EN": 801,
843
+ "桂乃芬_EN": 802,
844
+ "艾丝妲_EN": 803,
845
+ "彦卿_EN": 804,
846
+ "玲可_EN": 805,
847
+ "托帕_EN": 806,
848
+ "驭空_EN": 807,
849
+ "浮烟_EN": 808,
850
+ "停云_EN": 809,
851
+ "镜流_EN": 810,
852
+ "罗刹_EN": 811,
853
+ "卢卡_EN": 812,
854
+ "史瓦罗_EN": 813,
855
+ "黑塔_EN": 814,
856
+ "桑博_EN": 815,
857
+ "伦纳德_EN": 816,
858
+ "明曦_EN": 817,
859
+ "银狼_EN": 818,
860
+ "帕姆_EN": 819,
861
+ "青雀_EN": 820,
862
+ "乔瓦尼_EN": 821,
863
+ "公输师傅_EN": 822,
864
+ "晴霓_EN": 823,
865
+ "螺丝咕姆_EN": 824,
866
+ "阿兰_EN": 825,
867
+ "奥列格_EN": 826,
868
+ "丹枢_EN": 827,
869
+ "尾巴_EN": 828,
870
+ "寒鸦_EN": 829,
871
+ "雪衣_EN": 830,
872
+ "可可利亚_EN": 831,
873
+ "青镞_EN": 832,
874
+ "半夏_EN": 833,
875
+ "银枝_EN": 834,
876
+ "大毫_EN": 835,
877
+ "霄翰_EN": 836,
878
+ "信使_EN": 837,
879
+ "费斯曼_EN": 838,
880
+ "绿芙蓉_EN": 839,
881
+ "dev_成男_EN": 840,
882
+ "金人会长_EN": 841,
883
+ "维利特_EN": 842,
884
+ "维尔德_EN": 843,
885
+ "刃_EN": 844,
886
+ "卡波特_EN": 845,
887
+ "岩明_EN": 846,
888
+ "浣溪_EN": 847,
889
+ "紫月季_EN": 848,
890
+ "幻胧_EN": 849,
891
+ "女声_EN": 850,
892
+ "陆景和": 851,
893
+ "莫弈": 852,
894
+ "左然": 853,
895
+ "夏彦": 854
896
  }
897
  },
898
  "model": {
 
949
  "use_spectral_norm": false,
950
  "gin_channels": 256
951
  },
952
+ "version": "2.2"
953
+ }
css/custom.css ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ #yml_code {
3
+ height: 600px;
4
+ flex-grow: inherit;
5
+ overflow-y: auto;
6
+ }
7
+
8
+ #json_code {
9
+ height: 600px;
10
+ flex-grow: inherit;
11
+ overflow-y: auto;
12
+ }
13
+
14
+ #gpu_code {
15
+ height: 300px;
16
+ flex-grow: inherit;
17
+ overflow-y: auto;
18
+ }
data_utils.py CHANGED
@@ -3,11 +3,13 @@ import random
3
  import torch
4
  import torch.utils.data
5
  from tqdm import tqdm
 
6
  from tools.log import logger
7
  import commons
8
  from mel_processing import spectrogram_torch, mel_spectrogram_torch
9
  from utils import load_wav_to_torch, load_filepaths_and_text
10
  from text import cleaned_text_to_sequence
 
11
 
12
  """Multi speaker version"""
13
 
@@ -40,7 +42,11 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
40
 
41
  self.add_blank = hparams.add_blank
42
  self.min_text_len = getattr(hparams, "min_text_len", 1)
43
- self.max_text_len = getattr(hparams, "max_text_len", 300)
 
 
 
 
44
 
45
  random.seed(1234)
46
  random.shuffle(self.audiopaths_sid_text)
@@ -91,7 +97,15 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
91
 
92
  spec, wav = self.get_audio(audiopath)
93
  sid = torch.LongTensor([int(self.spk_map[sid])])
94
- return (phones, spec, wav, sid, tone, language, bert, ja_bert, en_bert)
 
 
 
 
 
 
 
 
95
 
96
  def get_audio(self, filename):
97
  audio, sampling_rate = load_wav_to_torch(filename)
@@ -131,7 +145,8 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
131
  center=False,
132
  )
133
  spec = torch.squeeze(spec, 0)
134
- torch.save(spec, spec_filename)
 
135
  return spec, audio_norm
136
 
137
  def get_text(self, text, word2ph, phone, tone, language_str, wav_path):
@@ -153,15 +168,15 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
153
 
154
  if language_str == "ZH":
155
  bert = bert_ori
156
- ja_bert = torch.zeros(1024, len(phone))
157
- en_bert = torch.zeros(1024, len(phone))
158
  elif language_str == "JP":
159
- bert = torch.zeros(1024, len(phone))
160
  ja_bert = bert_ori
161
- en_bert = torch.zeros(1024, len(phone))
162
  elif language_str == "EN":
163
- bert = torch.zeros(1024, len(phone))
164
- ja_bert = torch.zeros(1024, len(phone))
165
  en_bert = bert_ori
166
  phone = torch.LongTensor(phone)
167
  tone = torch.LongTensor(tone)
@@ -211,6 +226,7 @@ class TextAudioSpeakerCollate:
211
  bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
212
  ja_bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
213
  en_bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
 
214
 
215
  spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
216
  wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
@@ -222,6 +238,7 @@ class TextAudioSpeakerCollate:
222
  bert_padded.zero_()
223
  ja_bert_padded.zero_()
224
  en_bert_padded.zero_()
 
225
 
226
  for i in range(len(ids_sorted_decreasing)):
227
  row = batch[ids_sorted_decreasing[i]]
@@ -255,6 +272,8 @@ class TextAudioSpeakerCollate:
255
  en_bert = row[8]
256
  en_bert_padded[i, :, : en_bert.size(1)] = en_bert
257
 
 
 
258
  return (
259
  text_padded,
260
  text_lengths,
@@ -268,6 +287,7 @@ class TextAudioSpeakerCollate:
268
  bert_padded,
269
  ja_bert_padded,
270
  en_bert_padded,
 
271
  )
272
 
273
 
 
3
  import torch
4
  import torch.utils.data
5
  from tqdm import tqdm
6
+ import numpy as np
7
  from tools.log import logger
8
  import commons
9
  from mel_processing import spectrogram_torch, mel_spectrogram_torch
10
  from utils import load_wav_to_torch, load_filepaths_and_text
11
  from text import cleaned_text_to_sequence
12
+ from config import config
13
 
14
  """Multi speaker version"""
15
 
 
42
 
43
  self.add_blank = hparams.add_blank
44
  self.min_text_len = getattr(hparams, "min_text_len", 1)
45
+ self.max_text_len = getattr(hparams, "max_text_len", 384)
46
+
47
+ self.empty_emo = torch.squeeze(
48
+ torch.load("empty_emo.npy", map_location="cpu"), dim=1
49
+ )
50
 
51
  random.seed(1234)
52
  random.shuffle(self.audiopaths_sid_text)
 
97
 
98
  spec, wav = self.get_audio(audiopath)
99
  sid = torch.LongTensor([int(self.spk_map[sid])])
100
+
101
+ if np.random.rand() > 0.1:
102
+ emo = torch.squeeze(
103
+ torch.load(audiopath.replace(".wav", ".emo.npy"), map_location="cpu"),
104
+ dim=1,
105
+ )
106
+ else:
107
+ emo = self.empty_emo
108
+ return (phones, spec, wav, sid, tone, language, bert, ja_bert, en_bert, emo)
109
 
110
  def get_audio(self, filename):
111
  audio, sampling_rate = load_wav_to_torch(filename)
 
145
  center=False,
146
  )
147
  spec = torch.squeeze(spec, 0)
148
+ if config.train_ms_config.spec_cache:
149
+ torch.save(spec, spec_filename)
150
  return spec, audio_norm
151
 
152
  def get_text(self, text, word2ph, phone, tone, language_str, wav_path):
 
168
 
169
  if language_str == "ZH":
170
  bert = bert_ori
171
+ ja_bert = torch.rand(1024, len(phone))
172
+ en_bert = torch.rand(1024, len(phone))
173
  elif language_str == "JP":
174
+ bert = torch.rand(1024, len(phone))
175
  ja_bert = bert_ori
176
+ en_bert = torch.rand(1024, len(phone))
177
  elif language_str == "EN":
178
+ bert = torch.rand(1024, len(phone))
179
+ ja_bert = torch.rand(1024, len(phone))
180
  en_bert = bert_ori
181
  phone = torch.LongTensor(phone)
182
  tone = torch.LongTensor(tone)
 
226
  bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
227
  ja_bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
228
  en_bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
229
+ emo = torch.FloatTensor(len(batch), 512)
230
 
231
  spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
232
  wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
 
238
  bert_padded.zero_()
239
  ja_bert_padded.zero_()
240
  en_bert_padded.zero_()
241
+ emo.zero_()
242
 
243
  for i in range(len(ids_sorted_decreasing)):
244
  row = batch[ids_sorted_decreasing[i]]
 
272
  en_bert = row[8]
273
  en_bert_padded[i, :, : en_bert.size(1)] = en_bert
274
 
275
+ emo[i, :] = row[9]
276
+
277
  return (
278
  text_padded,
279
  text_lengths,
 
287
  bert_padded,
288
  ja_bert_padded,
289
  en_bert_padded,
290
+ emo,
291
  )
292
 
293
 
default_config.yml CHANGED
@@ -4,7 +4,7 @@
4
  # 拟提供通用路径配置,统一存放数据,避免数据放得很乱
5
  # 每个数据集与其对应的模型存放至统一路径下,后续所有的路径配置均为相对于datasetPath的路径
6
  # 不填或者填空则路径为相对于项目根目录的路径
7
- dataset_path: ""
8
 
9
  # 模型镜像源,默认huggingface,使用openi镜像源需指定openi_token
10
  mirror: ""
@@ -17,16 +17,16 @@ resample:
17
  sampling_rate: 44100
18
  # 音频文件输入路径,重采样会将该路径下所有.wav音频文件重采样
19
  # 请填入相对于datasetPath的相对路径
20
- in_dir: "" # 相对于根目录的路径为 /datasetPath/in_dir
21
  # 音频文件重采样后输出路径
22
- out_dir: ""
23
 
24
 
25
  # preprocess_text 数据集预处理相关配置
26
  # 注意, “:” 后需要加空格
27
  preprocess_text:
28
  # 原始文本文件路径,文本格式应为{wav_path}|{speaker_name}|{language}|{text}。
29
- transcription_path: "filelists/bushroid.list"
30
  # 数据清洗后文本路径,可以不填。不填则将在原始文本目录生成
31
  cleaned_path: ""
32
  # 训练集路径
@@ -35,10 +35,10 @@ preprocess_text:
35
  val_path: "filelists/val.list"
36
  # 配置文件路径
37
  config_path: "config.json"
38
- # 每个speaker的验证集条数
39
- val_per_spk: 4
40
  # 验证集最大条数,多于的会被截断并放到训练集中
41
- max_val_total: 8
42
  # 是否进行数据清洗
43
  clean: true
44
 
@@ -49,35 +49,51 @@ bert_gen:
49
  # 训练数据集配置文件路径
50
  config_path: "config.json"
51
  # 并行数
52
- num_processes: 2
53
  # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理
54
  # 该选项同时决定了get_bert_feature的默认设备
55
  device: "cuda"
56
  # 使用多卡推理
57
  use_multi_device: false
58
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
  # train 训练配置
61
  # 注意, “:” 后需要加空格
62
  train_ms:
63
- # 需要加载的环境变量,多显卡训练时RANK请手动在环境变量填写
64
- # 环境变量对应名称环境变量不存在时加载,也就是说手动添加的环境变量优先级更高,会覆盖本配置文件
65
  env:
66
  MASTER_ADDR: "localhost"
67
  MASTER_PORT: 10086
68
  WORLD_SIZE: 1
 
69
  RANK: 0
70
  # 可以填写任意名的环境变量
71
  # THE_ENV_VAR_YOU_NEED_TO_USE: "1234567"
72
  # 底模设置
73
  base:
74
- use_base_model: True
75
  repo_id: "Stardust_minus/Bert-VITS2"
76
- model_image: "Bert-VITS2中日英底模-fix" # openi网页的模型名
77
  # 训练模型存储目录:与旧版本的区别,原先数据集是存放在logs/model_name下的,现在改为统一存放在Data/你的数据集/models下
78
  model: "models"
79
  # 配置文件路径
80
- config_path: "configs/config.json"
 
 
 
 
 
 
81
 
82
 
83
  # webui webui配置
@@ -86,9 +102,9 @@ webui:
86
  # 推理设备
87
  device: "cuda"
88
  # 模型路径
89
- model: "genshin/models/G_8000.pth"
90
  # 配置文件路径
91
- config_path: "configs/config.json"
92
  # 端口号
93
  port: 7860
94
  # 是否公开部署,对外网开放
@@ -99,7 +115,7 @@ webui:
99
  language_identification_library: "langid"
100
 
101
 
102
- # server api配置
103
  # 注意, “:” 后需要加空格
104
  # 注意,本配置下的所有配置均为相对于根目录的路径
105
  server:
@@ -107,8 +123,10 @@ server:
107
  port: 5000
108
  # 模型默认使用设备:但是当前并没有实现这个配置。
109
  device: "cuda"
110
- # 需要加载的所有模型的配置
 
111
  # 注意,所有模型都必须正确配置model与config的路径,空路径会导致加载错误。
 
112
  models:
113
  - # 模型的路径
114
  model: ""
@@ -149,7 +167,6 @@ server:
149
  # 不必填写所有人物,不填的使用默认值
150
  speakers: [ ] # 也可以不填
151
 
152
-
153
  # 百度翻译开放平台 api配置
154
  # api接入文档 https://api.fanyi.baidu.com/doc/21
155
  # 请不要在github等网站公开分享你的app id 与 key
 
4
  # 拟提供通用路径配置,统一存放数据,避免数据放得很乱
5
  # 每个数据集与其对应的模型存放至统一路径下,后续所有的路径配置均为相对于datasetPath的路径
6
  # 不填或者填空则路径为相对于项目根目录的路径
7
+ dataset_path: "Data/"
8
 
9
  # 模型镜像源,默认huggingface,使用openi镜像源需指定openi_token
10
  mirror: ""
 
17
  sampling_rate: 44100
18
  # 音频文件输入路径,重采样会将该路径下所有.wav音频文件重采样
19
  # 请填入相对于datasetPath的相对路径
20
+ in_dir: "audios/raw" # 相对于根目录的路径为 /datasetPath/in_dir
21
  # 音频文件重采样后输出路径
22
+ out_dir: "audios/wavs"
23
 
24
 
25
  # preprocess_text 数据集预处理相关配置
26
  # 注意, “:” 后需要加空格
27
  preprocess_text:
28
  # 原始文本文件路径,文本格式应为{wav_path}|{speaker_name}|{language}|{text}。
29
+ transcription_path: "filelists/你的数据集文本.list"
30
  # 数据清洗后文本路径,可以不填。不填则将在原始文本目录生成
31
  cleaned_path: ""
32
  # 训练集路径
 
35
  val_path: "filelists/val.list"
36
  # 配置文件路径
37
  config_path: "config.json"
38
+ # 每个语言的验证集条数
39
+ val_per_lang: 4
40
  # 验证集最大条数,多于的会被截断并放到训练集中
41
+ max_val_total: 12
42
  # 是否进行数据清洗
43
  clean: true
44
 
 
49
  # 训练数据集配置文件路径
50
  config_path: "config.json"
51
  # 并行数
52
+ num_processes: 4
53
  # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理
54
  # 该选项同时决定了get_bert_feature的默认设备
55
  device: "cuda"
56
  # 使用多卡推理
57
  use_multi_device: false
58
 
59
+ # emo_gen 相关配置
60
+ # 注意, “:” 后需要加空格
61
+ emo_gen:
62
+ # 训练数据集配置文件路径
63
+ config_path: "config.json"
64
+ # 并行数
65
+ num_processes: 4
66
+ # 使用设备:可选项 "cuda" 显卡推理,"cpu" cpu推理
67
+ device: "cuda"
68
+ # 使用多卡推理
69
+ use_multi_device: false
70
 
71
  # train 训练配置
72
  # 注意, “:” 后需要加空格
73
  train_ms:
 
 
74
  env:
75
  MASTER_ADDR: "localhost"
76
  MASTER_PORT: 10086
77
  WORLD_SIZE: 1
78
+ LOCAL_RANK: 0
79
  RANK: 0
80
  # 可以填写任意名的环境变量
81
  # THE_ENV_VAR_YOU_NEED_TO_USE: "1234567"
82
  # 底模设置
83
  base:
84
+ use_base_model: false
85
  repo_id: "Stardust_minus/Bert-VITS2"
86
+ model_image: "Bert-VITS2_2.2-Clap底模" # openi网页的模型名
87
  # 训练模型存储目录:与旧版本的区别,原先数据集是存放在logs/model_name下的,现在改为统一存放在Data/你的数据集/models下
88
  model: "models"
89
  # 配置文件路径
90
+ config_path: "config.json"
91
+ # 训练使用的worker,不建议超过CPU核心数
92
+ num_workers: 16
93
+ # 关闭此项可以节约接近50%的磁盘空间,但是可能导致实际训练速度变慢和更高的CPU使用率。
94
+ spec_cache: True
95
+ # 保存的检查点数量,多于此数目的权重会被删除来节省空间。
96
+ keep_ckpts: 8
97
 
98
 
99
  # webui webui配置
 
102
  # 推理设备
103
  device: "cuda"
104
  # 模型路径
105
+ model: "models/G_8000.pth"
106
  # 配置文件路径
107
+ config_path: "config.json"
108
  # 端口号
109
  port: 7860
110
  # 是否公开部署,对外网开放
 
115
  language_identification_library: "langid"
116
 
117
 
118
+ # server-fastapi配置
119
  # 注意, “:” 后需要加空格
120
  # 注意,本配置下的所有配置均为相对于根目录的路径
121
  server:
 
123
  port: 5000
124
  # 模型默认使用设备:但是当前并没有实现这个配置。
125
  device: "cuda"
126
+ # 需要加载的所有模型的配置,可以填多个模型,也可以不填模型,等网页成功后手动加载模型
127
+ # 不加载模型的配置格式:删除默认给的两个模型配置,给models赋值 [ ],也就是空列表。参考模型2的speakers 即 models: [ ]
128
  # 注意,所有模型都必须正确配置model与config的路径,空路径会导致加载错误。
129
+ # 也可以不填模型,等网页加载成功后手动填写models。
130
  models:
131
  - # 模型的路径
132
  model: ""
 
167
  # 不必填写所有人物,不填的使用默认值
168
  speakers: [ ] # 也可以不填
169
 
 
170
  # 百度翻译开放平台 api配置
171
  # api接入文档 https://api.fanyi.baidu.com/doc/21
172
  # 请不要在github等网站公开分享你的app id 与 key
emotional/clap-htsat-fused/.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
emotional/clap-htsat-fused/README.md ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
4
+ # Model card for CLAP
5
+
6
+ Model card for CLAP: Contrastive Language-Audio Pretraining
7
+
8
+ ![clap_image](https://s3.amazonaws.com/moonup/production/uploads/1678811100805-62441d1d9fdefb55a0b7d12c.png)
9
+
10
+
11
+ # Table of Contents
12
+
13
+ 0. [TL;DR](#tldr)
14
+ 1. [Model Details](#model-details)
15
+ 2. [Usage](#usage)
16
+ 3. [Uses](#uses)
17
+ 4. [Citation](#citation)
18
+
19
+ # TL;DR
20
+
21
+ The abstract of the paper states that:
22
+
23
+ > Contrastive learning has shown remarkable success in the field of multimodal representation learning. In this paper, we propose a pipeline of contrastive language-audio pretraining to develop an audio representation by combining audio data with natural language descriptions. To accomplish this target, we first release LAION-Audio-630K, a large collection of 633,526 audio-text pairs from different data sources. Second, we construct a contrastive language-audio pretraining model by considering different audio encoders and text encoders. We incorporate the feature fusion mechanism and keyword-to-caption augmentation into the model design to further enable the model to process audio inputs of variable lengths and enhance the performance. Third, we perform comprehensive experiments to evaluate our model across three tasks: text-to-audio retrieval, zero-shot audio classification, and supervised audio classification. The results demonstrate that our model achieves superior performance in text-to-audio retrieval task. In audio classification tasks, the model achieves state-of-the-art performance in the zero-shot setting and is able to obtain performance comparable to models' results in the non-zero-shot setting. LAION-Audio-630K and the proposed model are both available to the public.
24
+
25
+
26
+ # Usage
27
+
28
+ You can use this model for zero-shot audio classification or for extracting audio and/or textual features.
29
+
30
+ # Uses
31
+
32
+ ## Perform zero-shot audio classification
33
+
34
+ ### Using `pipeline`
35
+
36
+ ```python
37
+ from datasets import load_dataset
38
+ from transformers import pipeline
39
+
40
+ dataset = load_dataset("ashraq/esc50")
41
+ audio = dataset["train"]["audio"][-1]["array"]
42
+
43
+ audio_classifier = pipeline(task="zero-shot-audio-classification", model="laion/clap-htsat-fused")
44
+ output = audio_classifier(audio, candidate_labels=["Sound of a dog", "Sound of vaccum cleaner"])
45
+ print(output)
46
+ >>> [{"score": 0.999, "label": "Sound of a dog"}, {"score": 0.001, "label": "Sound of vaccum cleaner"}]
47
+ ```
48
+
49
+ ## Run the model:
50
+
51
+ You can also get the audio and text embeddings using `ClapModel`
52
+
53
+ ### Run the model on CPU:
54
+
55
+ ```python
56
+ from datasets import load_dataset
57
+ from transformers import ClapModel, ClapProcessor
58
+
59
+ librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
60
+ audio_sample = librispeech_dummy[0]
61
+
62
+ model = ClapModel.from_pretrained("laion/clap-htsat-fused")
63
+ processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")
64
+
65
+ inputs = processor(audios=audio_sample["audio"]["array"], return_tensors="pt")
66
+ audio_embed = model.get_audio_features(**inputs)
67
+ ```
68
+
69
+ ### Run the model on GPU:
70
+
71
+ ```python
72
+ from datasets import load_dataset
73
+ from transformers import ClapModel, ClapProcessor
74
+
75
+ librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
76
+ audio_sample = librispeech_dummy[0]
77
+
78
+ model = ClapModel.from_pretrained("laion/clap-htsat-fused").to(0)
79
+ processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")
80
+
81
+ inputs = processor(audios=audio_sample["audio"]["array"], return_tensors="pt").to(0)
82
+ audio_embed = model.get_audio_features(**inputs)
83
+ ```
84
+
85
+
86
+ # Citation
87
+
88
+ If you are using this model for your work, please consider citing the original paper:
89
+ ```
90
+ @misc{https://doi.org/10.48550/arxiv.2211.06687,
91
+ doi = {10.48550/ARXIV.2211.06687},
92
+
93
+ url = {https://arxiv.org/abs/2211.06687},
94
+
95
+ author = {Wu, Yusong and Chen, Ke and Zhang, Tianyu and Hui, Yuchen and Berg-Kirkpatrick, Taylor and Dubnov, Shlomo},
96
+
97
+ keywords = {Sound (cs.SD), Audio and Speech Processing (eess.AS), FOS: Computer and information sciences, FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering, FOS: Electrical engineering, electronic engineering, information engineering},
98
+
99
+ title = {Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation},
100
+
101
+ publisher = {arXiv},
102
+
103
+ year = {2022},
104
+
105
+ copyright = {Creative Commons Attribution 4.0 International}
106
+ }
107
+ ```
emotional/clap-htsat-fused/config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_commit_hash": null,
3
+ "architectures": [
4
+ "ClapModel"
5
+ ],
6
+ "audio_config": {
7
+ "_name_or_path": "",
8
+ "add_cross_attention": false,
9
+ "aff_block_r": 4,
10
+ "architectures": null,
11
+ "attention_probs_dropout_prob": 0.0,
12
+ "bad_words_ids": null,
13
+ "begin_suppress_tokens": null,
14
+ "bos_token_id": null,
15
+ "chunk_size_feed_forward": 0,
16
+ "cross_attention_hidden_size": null,
17
+ "decoder_start_token_id": null,
18
+ "depths": [
19
+ 2,
20
+ 2,
21
+ 6,
22
+ 2
23
+ ],
24
+ "diversity_penalty": 0.0,
25
+ "do_sample": false,
26
+ "drop_path_rate": 0.0,
27
+ "early_stopping": false,
28
+ "enable_fusion": true,
29
+ "enable_patch_fusion": true,
30
+ "enable_patch_layer_norm": true,
31
+ "encoder_no_repeat_ngram_size": 0,
32
+ "eos_token_id": null,
33
+ "exponential_decay_length_penalty": null,
34
+ "finetuning_task": null,
35
+ "flatten_patch_embeds": true,
36
+ "forced_bos_token_id": null,
37
+ "forced_eos_token_id": null,
38
+ "fusion_num_hidden_layers": 2,
39
+ "fusion_type": null,
40
+ "hidden_act": "gelu",
41
+ "hidden_dropout_prob": 0.1,
42
+ "hidden_size": 768,
43
+ "id2label": {
44
+ "0": "LABEL_0",
45
+ "1": "LABEL_1"
46
+ },
47
+ "initializer_factor": 1.0,
48
+ "is_decoder": false,
49
+ "is_encoder_decoder": false,
50
+ "label2id": {
51
+ "LABEL_0": 0,
52
+ "LABEL_1": 1
53
+ },
54
+ "layer_norm_eps": 1e-05,
55
+ "length_penalty": 1.0,
56
+ "max_length": 20,
57
+ "min_length": 0,
58
+ "mlp_ratio": 4.0,
59
+ "model_type": "clap_audio_model",
60
+ "no_repeat_ngram_size": 0,
61
+ "num_attention_heads": [
62
+ 4,
63
+ 8,
64
+ 16,
65
+ 32
66
+ ],
67
+ "num_beam_groups": 1,
68
+ "num_beams": 1,
69
+ "num_classes": 527,
70
+ "num_hidden_layers": 4,
71
+ "num_mel_bins": 64,
72
+ "num_return_sequences": 1,
73
+ "output_attentions": false,
74
+ "output_hidden_states": false,
75
+ "output_scores": false,
76
+ "pad_token_id": null,
77
+ "patch_embed_input_channels": 1,
78
+ "patch_embeds_hidden_size": 96,
79
+ "patch_size": 4,
80
+ "patch_stride": [
81
+ 4,
82
+ 4
83
+ ],
84
+ "prefix": null,
85
+ "problem_type": null,
86
+ "projection_dim": 512,
87
+ "projection_hidden_act": "relu",
88
+ "projection_hidden_size": 768,
89
+ "pruned_heads": {},
90
+ "qkv_bias": true,
91
+ "remove_invalid_values": false,
92
+ "repetition_penalty": 1.0,
93
+ "return_dict": true,
94
+ "return_dict_in_generate": false,
95
+ "sep_token_id": null,
96
+ "spec_size": 256,
97
+ "suppress_tokens": null,
98
+ "task_specific_params": null,
99
+ "temperature": 1.0,
100
+ "tf_legacy_loss": false,
101
+ "tie_encoder_decoder": false,
102
+ "tie_word_embeddings": true,
103
+ "tokenizer_class": null,
104
+ "top_k": 50,
105
+ "top_p": 1.0,
106
+ "torch_dtype": null,
107
+ "torchscript": false,
108
+ "transformers_version": "4.27.0.dev0",
109
+ "typical_p": 1.0,
110
+ "use_bfloat16": false,
111
+ "window_size": 8
112
+ },
113
+ "hidden_size": 768,
114
+ "initializer_factor": 1.0,
115
+ "logit_scale_init_value": 14.285714285714285,
116
+ "model_type": "clap",
117
+ "num_hidden_layers": 16,
118
+ "projection_dim": 512,
119
+ "projection_hidden_act": "relu",
120
+ "text_config": {
121
+ "_name_or_path": "",
122
+ "add_cross_attention": false,
123
+ "architectures": null,
124
+ "attention_probs_dropout_prob": 0.1,
125
+ "bad_words_ids": null,
126
+ "begin_suppress_tokens": null,
127
+ "bos_token_id": 0,
128
+ "chunk_size_feed_forward": 0,
129
+ "classifier_dropout": null,
130
+ "cross_attention_hidden_size": null,
131
+ "decoder_start_token_id": null,
132
+ "diversity_penalty": 0.0,
133
+ "do_sample": false,
134
+ "early_stopping": false,
135
+ "encoder_no_repeat_ngram_size": 0,
136
+ "eos_token_id": 2,
137
+ "exponential_decay_length_penalty": null,
138
+ "finetuning_task": null,
139
+ "forced_bos_token_id": null,
140
+ "forced_eos_token_id": null,
141
+ "fusion_hidden_size": 768,
142
+ "fusion_num_hidden_layers": 2,
143
+ "hidden_act": "gelu",
144
+ "hidden_dropout_prob": 0.1,
145
+ "hidden_size": 768,
146
+ "id2label": {
147
+ "0": "LABEL_0",
148
+ "1": "LABEL_1"
149
+ },
150
+ "initializer_factor": 1.0,
151
+ "initializer_range": 0.02,
152
+ "intermediate_size": 3072,
153
+ "is_decoder": false,
154
+ "is_encoder_decoder": false,
155
+ "label2id": {
156
+ "LABEL_0": 0,
157
+ "LABEL_1": 1
158
+ },
159
+ "layer_norm_eps": 1e-12,
160
+ "length_penalty": 1.0,
161
+ "max_length": 20,
162
+ "max_position_embeddings": 514,
163
+ "min_length": 0,
164
+ "model_type": "clap_text_model",
165
+ "no_repeat_ngram_size": 0,
166
+ "num_attention_heads": 12,
167
+ "num_beam_groups": 1,
168
+ "num_beams": 1,
169
+ "num_hidden_layers": 12,
170
+ "num_return_sequences": 1,
171
+ "output_attentions": false,
172
+ "output_hidden_states": false,
173
+ "output_scores": false,
174
+ "pad_token_id": 1,
175
+ "position_embedding_type": "absolute",
176
+ "prefix": null,
177
+ "problem_type": null,
178
+ "projection_dim": 512,
179
+ "projection_hidden_act": "relu",
180
+ "projection_hidden_size": 768,
181
+ "pruned_heads": {},
182
+ "remove_invalid_values": false,
183
+ "repetition_penalty": 1.0,
184
+ "return_dict": true,
185
+ "return_dict_in_generate": false,
186
+ "sep_token_id": null,
187
+ "suppress_tokens": null,
188
+ "task_specific_params": null,
189
+ "temperature": 1.0,
190
+ "tf_legacy_loss": false,
191
+ "tie_encoder_decoder": false,
192
+ "tie_word_embeddings": true,
193
+ "tokenizer_class": null,
194
+ "top_k": 50,
195
+ "top_p": 1.0,
196
+ "torch_dtype": null,
197
+ "torchscript": false,
198
+ "transformers_version": "4.27.0.dev0",
199
+ "type_vocab_size": 1,
200
+ "typical_p": 1.0,
201
+ "use_bfloat16": false,
202
+ "use_cache": true,
203
+ "vocab_size": 50265
204
+ },
205
+ "torch_dtype": "float32",
206
+ "transformers_version": null
207
+ }
emotional/clap-htsat-fused/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
emotional/clap-htsat-fused/preprocessor_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "chunk_length_s": 10,
3
+ "feature_extractor_type": "ClapFeatureExtractor",
4
+ "feature_size": 64,
5
+ "fft_window_size": 1024,
6
+ "frequency_max": 14000,
7
+ "frequency_min": 50,
8
+ "hop_length": 480,
9
+ "max_length_s": 10,
10
+ "n_fft": 1024,
11
+ "nb_frequency_bins": 513,
12
+ "nb_max_frames": 1000,
13
+ "nb_max_samples": 480000,
14
+ "padding": "repeatpad",
15
+ "padding_side": "right",
16
+ "padding_value": 0.0,
17
+ "processor_class": "ClapProcessor",
18
+ "return_attention_mask": false,
19
+ "sampling_rate": 48000,
20
+ "top_db": null,
21
+ "truncation": "fusion"
22
+ }
emotional/clap-htsat-fused/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ed5d0215d887551ddd0a49ce7311b21429ebdf1e6a129d4e68f743357225253
3
+ size 614596545
emotional/clap-htsat-fused/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
emotional/clap-htsat-fused/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
emotional/clap-htsat-fused/tokenizer_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<s>",
4
+ "cls_token": "<s>",
5
+ "eos_token": "</s>",
6
+ "errors": "replace",
7
+ "mask_token": "<mask>",
8
+ "model_max_length": 512,
9
+ "pad_token": "<pad>",
10
+ "processor_class": "ClapProcessor",
11
+ "sep_token": "</s>",
12
+ "special_tokens_map_file": null,
13
+ "tokenizer_class": "RobertaTokenizer",
14
+ "trim_offsets": true,
15
+ "unk_token": "<unk>"
16
+ }
emotional/clap-htsat-fused/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
empty_emo.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07063411ab7d6e7aacfc73c582616c3fbc8fdf518b20d42d8be77bc9caf6fab9
3
+ size 3238
export_onnx.py CHANGED
@@ -1,54 +1,10 @@
1
- from models_onnx import SynthesizerTrn
2
- import utils
3
- from text.symbols import symbols
4
  import os
5
- import json
6
-
7
-
8
- def export_onnx(export_path, model_path, config_path):
9
- hps = utils.get_hparams_from_file(config_path)
10
- net_g = SynthesizerTrn(
11
- len(symbols),
12
- hps.data.filter_length // 2 + 1,
13
- hps.train.segment_size // hps.data.hop_length,
14
- n_speakers=hps.data.n_speakers,
15
- **hps.model,
16
- )
17
- _ = net_g.eval()
18
- _ = utils.load_checkpoint(model_path, net_g, None, skip_optimizer=True)
19
- net_g.export_onnx(export_path)
20
-
21
- spklist = []
22
- for key in hps.data.spk2id.keys():
23
- spklist.append(key)
24
-
25
- MoeVSConf = {
26
- "Folder": f"{export_path}",
27
- "Name": f"{export_path}",
28
- "Type": "BertVits",
29
- "Symbol": symbols,
30
- "Cleaner": "",
31
- "Rate": hps.data.sampling_rate,
32
- "CharaMix": True,
33
- "Characters": spklist,
34
- "LanguageMap": {"ZH": [0, 0], "JP": [1, 6], "EN": [2, 8]},
35
- "Dict": "BasicDict",
36
- "BertPath": [
37
- "chinese-roberta-wwm-ext-large",
38
- "deberta-v2-large-japanese",
39
- "bert-base-japanese-v3",
40
- ],
41
- }
42
-
43
- with open(f"onnx/{export_path}.json", "w") as MoeVsConfFile:
44
- json.dump(MoeVSConf, MoeVsConfFile, indent=4)
45
-
46
 
47
  if __name__ == "__main__":
48
- print(symbols)
49
- export_path = "HimenoSena"
50
- model_path = "G_53000.pth"
51
- config_path = "config.json"
52
  if not os.path.exists("onnx"):
53
  os.makedirs("onnx")
54
  if not os.path.exists(f"onnx/{export_path}"):
 
1
+ from onnx_modules import export_onnx
 
 
2
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  if __name__ == "__main__":
5
+ export_path = "BertVits2.2PT"
6
+ model_path = "model\\G_0.pth"
7
+ config_path = "model\\config.json"
 
8
  if not os.path.exists("onnx"):
9
  os.makedirs("onnx")
10
  if not os.path.exists(f"onnx/{export_path}"):
filelists/sample.list ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Example:
2
+ {wav_path}|{speaker_name}|{language}|{text}
3
+ 派蒙_1.wav|派蒙|ZH|前面的区域,以后再来探索吧!
img/yuyu.png ADDED
img//345/217/202/346/225/260/350/257/264/346/230/216.png ADDED
img//345/256/265/345/256/253.png ADDED
img//345/276/256/344/277/241/345/233/276/347/211/207_20231010105112.png ADDED
img//347/245/236/351/207/214/347/273/253/345/215/216.png ADDED
img//347/272/263/350/245/277/345/246/262.png ADDED
infer.py CHANGED
@@ -5,17 +5,23 @@
5
  2. 请在模型的config.json中显示声明版本号,添加一个字段"version" : "你的版本号"
6
  特殊版本说明:
7
  1.1.1-fix: 1.1.1版本训练的模型,但是在推理时使用dev的日语修复
8
- 1.1.1-dev: dev开发
9
- 2.0:当前版本
10
  """
11
  import torch
12
  import commons
13
  from text import cleaned_text_to_sequence, get_bert
 
14
  from text.cleaner import clean_text
15
  import utils
 
16
 
17
  from models import SynthesizerTrn
18
  from text.symbols import symbols
 
 
 
 
 
19
  from oldVersion.V111.models import SynthesizerTrn as V111SynthesizerTrn
20
  from oldVersion.V111.text import symbols as V111symbols
21
  from oldVersion.V110.models import SynthesizerTrn as V110SynthesizerTrn
@@ -23,13 +29,17 @@ from oldVersion.V110.text import symbols as V110symbols
23
  from oldVersion.V101.models import SynthesizerTrn as V101SynthesizerTrn
24
  from oldVersion.V101.text import symbols as V101symbols
25
 
26
- from oldVersion import V111, V110, V101
27
 
28
  # 当前版本信息
29
- latest_version = "2.0"
30
 
31
  # 版本兼容
32
  SynthesizerTrnMap = {
 
 
 
 
33
  "1.1.1-fix": V111SynthesizerTrn,
34
  "1.1.1": V111SynthesizerTrn,
35
  "1.1": V110SynthesizerTrn,
@@ -40,6 +50,10 @@ SynthesizerTrnMap = {
40
  }
41
 
42
  symbolsMap = {
 
 
 
 
43
  "1.1.1-fix": V111symbols,
44
  "1.1.1": V111symbols,
45
  "1.1": V110symbols,
@@ -50,6 +64,17 @@ symbolsMap = {
50
  }
51
 
52
 
 
 
 
 
 
 
 
 
 
 
 
53
  def get_net_g(model_path: str, version: str, device: str, hps):
54
  if version != latest_version:
55
  net_g = SynthesizerTrnMap[version](
@@ -91,15 +116,15 @@ def get_text(text, language_str, hps, device):
91
 
92
  if language_str == "ZH":
93
  bert = bert_ori
94
- ja_bert = torch.zeros(1024, len(phone))
95
- en_bert = torch.zeros(1024, len(phone))
96
  elif language_str == "JP":
97
- bert = torch.zeros(1024, len(phone))
98
  ja_bert = bert_ori
99
- en_bert = torch.zeros(1024, len(phone))
100
  elif language_str == "EN":
101
- bert = torch.zeros(1024, len(phone))
102
- ja_bert = torch.zeros(1024, len(phone))
103
  en_bert = bert_ori
104
  else:
105
  raise ValueError("language_str should be ZH, JP or EN")
@@ -116,6 +141,7 @@ def get_text(text, language_str, hps, device):
116
 
117
  def infer(
118
  text,
 
119
  sdp_ratio,
120
  noise_scale,
121
  noise_scale_w,
@@ -125,9 +151,20 @@ def infer(
125
  hps,
126
  net_g,
127
  device,
 
 
 
128
  ):
129
- # 支持中日双语版本
 
 
 
 
 
130
  inferMap_V2 = {
 
 
 
131
  "1.1.1-fix": V111.infer_fix,
132
  "1.1.1": V111.infer,
133
  "1.1": V110.infer,
@@ -143,6 +180,23 @@ def infer(
143
  version = hps.version if hasattr(hps, "version") else latest_version
144
  # 非当前版本,根据版本号选择合适的infer
145
  if version != latest_version:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  if version in inferMap_V2.keys():
147
  return inferMap_V2[version](
148
  text,
@@ -169,9 +223,127 @@ def infer(
169
  device,
170
  )
171
  # 在此处实现当前版本的推理
 
 
 
 
 
 
 
172
  bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
173
  text, language, hps, device
174
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  with torch.no_grad():
176
  x_tst = phones.to(device).unsqueeze(0)
177
  tones = tones.to(device).unsqueeze(0)
@@ -179,6 +351,7 @@ def infer(
179
  bert = bert.to(device).unsqueeze(0)
180
  ja_bert = ja_bert.to(device).unsqueeze(0)
181
  en_bert = en_bert.to(device).unsqueeze(0)
 
182
  x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
183
  del phones
184
  speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
@@ -192,6 +365,7 @@ def infer(
192
  bert,
193
  ja_bert,
194
  en_bert,
 
195
  sdp_ratio=sdp_ratio,
196
  noise_scale=noise_scale,
197
  noise_scale_w=noise_scale_w,
@@ -201,7 +375,7 @@ def infer(
201
  .float()
202
  .numpy()
203
  )
204
- del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers, ja_bert, en_bert
205
  if torch.cuda.is_available():
206
  torch.cuda.empty_cache()
207
  return audio
 
5
  2. 请在模型的config.json中显示声明版本号,添加一个字段"version" : "你的版本号"
6
  特殊版本说明:
7
  1.1.1-fix: 1.1.1版本训练的模型,但是在推理时使用dev的日语修复
8
+ 2.2:当前版本
 
9
  """
10
  import torch
11
  import commons
12
  from text import cleaned_text_to_sequence, get_bert
13
+ from clap_wrapper import get_clap_audio_feature, get_clap_text_feature
14
  from text.cleaner import clean_text
15
  import utils
16
+ import numpy as np
17
 
18
  from models import SynthesizerTrn
19
  from text.symbols import symbols
20
+
21
+ from oldVersion.V210.models import SynthesizerTrn as V210SynthesizerTrn
22
+ from oldVersion.V210.text import symbols as V210symbols
23
+ from oldVersion.V200.models import SynthesizerTrn as V200SynthesizerTrn
24
+ from oldVersion.V200.text import symbols as V200symbols
25
  from oldVersion.V111.models import SynthesizerTrn as V111SynthesizerTrn
26
  from oldVersion.V111.text import symbols as V111symbols
27
  from oldVersion.V110.models import SynthesizerTrn as V110SynthesizerTrn
 
29
  from oldVersion.V101.models import SynthesizerTrn as V101SynthesizerTrn
30
  from oldVersion.V101.text import symbols as V101symbols
31
 
32
+ from oldVersion import V111, V110, V101, V200, V210
33
 
34
  # 当前版本信息
35
+ latest_version = "2.2"
36
 
37
  # 版本兼容
38
  SynthesizerTrnMap = {
39
+ "2.1": V210SynthesizerTrn,
40
+ "2.0.2-fix": V200SynthesizerTrn,
41
+ "2.0.1": V200SynthesizerTrn,
42
+ "2.0": V200SynthesizerTrn,
43
  "1.1.1-fix": V111SynthesizerTrn,
44
  "1.1.1": V111SynthesizerTrn,
45
  "1.1": V110SynthesizerTrn,
 
50
  }
51
 
52
  symbolsMap = {
53
+ "2.1": V210symbols,
54
+ "2.0.2-fix": V200symbols,
55
+ "2.0.1": V200symbols,
56
+ "2.0": V200symbols,
57
  "1.1.1-fix": V111symbols,
58
  "1.1.1": V111symbols,
59
  "1.1": V110symbols,
 
64
  }
65
 
66
 
67
+ # def get_emo_(reference_audio, emotion, sid):
68
+ # emo = (
69
+ # torch.from_numpy(get_emo(reference_audio))
70
+ # if reference_audio and emotion == -1
71
+ # else torch.FloatTensor(
72
+ # np.load(f"emo_clustering/{sid}/cluster_center_{emotion}.npy")
73
+ # )
74
+ # )
75
+ # return emo
76
+
77
+
78
  def get_net_g(model_path: str, version: str, device: str, hps):
79
  if version != latest_version:
80
  net_g = SynthesizerTrnMap[version](
 
116
 
117
  if language_str == "ZH":
118
  bert = bert_ori
119
+ ja_bert = torch.rand(1024, len(phone))
120
+ en_bert = torch.rand(1024, len(phone))
121
  elif language_str == "JP":
122
+ bert = torch.rand(1024, len(phone))
123
  ja_bert = bert_ori
124
+ en_bert = torch.rand(1024, len(phone))
125
  elif language_str == "EN":
126
+ bert = torch.rand(1024, len(phone))
127
+ ja_bert = torch.rand(1024, len(phone))
128
  en_bert = bert_ori
129
  else:
130
  raise ValueError("language_str should be ZH, JP or EN")
 
141
 
142
  def infer(
143
  text,
144
+ emotion,
145
  sdp_ratio,
146
  noise_scale,
147
  noise_scale_w,
 
151
  hps,
152
  net_g,
153
  device,
154
+ reference_audio=None,
155
+ skip_start=False,
156
+ skip_end=False,
157
  ):
158
+ # 2.2版本参数位置变了
159
+ # 2.1 参数新增 emotion reference_audio skip_start skip_end
160
+ inferMap_V3 = {
161
+ "2.1": V210.infer,
162
+ }
163
+ # 支持中日英三语版本
164
  inferMap_V2 = {
165
+ "2.0.2-fix": V200.infer,
166
+ "2.0.1": V200.infer,
167
+ "2.0": V200.infer,
168
  "1.1.1-fix": V111.infer_fix,
169
  "1.1.1": V111.infer,
170
  "1.1": V110.infer,
 
180
  version = hps.version if hasattr(hps, "version") else latest_version
181
  # 非当前版本,根据版本号选择合适的infer
182
  if version != latest_version:
183
+ if version in inferMap_V3.keys():
184
+ return inferMap_V3[version](
185
+ text,
186
+ sdp_ratio,
187
+ noise_scale,
188
+ noise_scale_w,
189
+ length_scale,
190
+ sid,
191
+ language,
192
+ hps,
193
+ net_g,
194
+ device,
195
+ reference_audio,
196
+ emotion,
197
+ skip_start,
198
+ skip_end,
199
+ )
200
  if version in inferMap_V2.keys():
201
  return inferMap_V2[version](
202
  text,
 
223
  device,
224
  )
225
  # 在此处实现当前版本的推理
226
+ # emo = get_emo_(reference_audio, emotion, sid)
227
+ if isinstance(reference_audio, np.ndarray):
228
+ emo = get_clap_audio_feature(reference_audio, device)
229
+ else:
230
+ emo = get_clap_text_feature(emotion, device)
231
+ emo = torch.squeeze(emo, dim=1)
232
+
233
  bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
234
  text, language, hps, device
235
  )
236
+ if skip_start:
237
+ phones = phones[3:]
238
+ tones = tones[3:]
239
+ lang_ids = lang_ids[3:]
240
+ bert = bert[:, 3:]
241
+ ja_bert = ja_bert[:, 3:]
242
+ en_bert = en_bert[:, 3:]
243
+ if skip_end:
244
+ phones = phones[:-2]
245
+ tones = tones[:-2]
246
+ lang_ids = lang_ids[:-2]
247
+ bert = bert[:, :-2]
248
+ ja_bert = ja_bert[:, :-2]
249
+ en_bert = en_bert[:, :-2]
250
+ with torch.no_grad():
251
+ x_tst = phones.to(device).unsqueeze(0)
252
+ tones = tones.to(device).unsqueeze(0)
253
+ lang_ids = lang_ids.to(device).unsqueeze(0)
254
+ bert = bert.to(device).unsqueeze(0)
255
+ ja_bert = ja_bert.to(device).unsqueeze(0)
256
+ en_bert = en_bert.to(device).unsqueeze(0)
257
+ x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
258
+ emo = emo.to(device).unsqueeze(0)
259
+ del phones
260
+ speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
261
+ audio = (
262
+ net_g.infer(
263
+ x_tst,
264
+ x_tst_lengths,
265
+ speakers,
266
+ tones,
267
+ lang_ids,
268
+ bert,
269
+ ja_bert,
270
+ en_bert,
271
+ emo,
272
+ sdp_ratio=sdp_ratio,
273
+ noise_scale=noise_scale,
274
+ noise_scale_w=noise_scale_w,
275
+ length_scale=length_scale,
276
+ )[0][0, 0]
277
+ .data.cpu()
278
+ .float()
279
+ .numpy()
280
+ )
281
+ del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers, ja_bert, en_bert, emo
282
+ if torch.cuda.is_available():
283
+ torch.cuda.empty_cache()
284
+ return audio
285
+
286
+
287
+ def infer_multilang(
288
+ text,
289
+ sdp_ratio,
290
+ noise_scale,
291
+ noise_scale_w,
292
+ length_scale,
293
+ sid,
294
+ language,
295
+ hps,
296
+ net_g,
297
+ device,
298
+ reference_audio=None,
299
+ emotion=None,
300
+ skip_start=False,
301
+ skip_end=False,
302
+ ):
303
+ bert, ja_bert, en_bert, phones, tones, lang_ids = [], [], [], [], [], []
304
+ # emo = get_emo_(reference_audio, emotion, sid)
305
+ if isinstance(reference_audio, np.ndarray):
306
+ emo = get_clap_audio_feature(reference_audio, device)
307
+ else:
308
+ emo = get_clap_text_feature(emotion, device)
309
+ emo = torch.squeeze(emo, dim=1)
310
+ for idx, (txt, lang) in enumerate(zip(text, language)):
311
+ skip_start = (idx != 0) or (skip_start and idx == 0)
312
+ skip_end = (idx != len(text) - 1) or (skip_end and idx == len(text) - 1)
313
+ (
314
+ temp_bert,
315
+ temp_ja_bert,
316
+ temp_en_bert,
317
+ temp_phones,
318
+ temp_tones,
319
+ temp_lang_ids,
320
+ ) = get_text(txt, lang, hps, device)
321
+ if skip_start:
322
+ temp_bert = temp_bert[:, 3:]
323
+ temp_ja_bert = temp_ja_bert[:, 3:]
324
+ temp_en_bert = temp_en_bert[:, 3:]
325
+ temp_phones = temp_phones[3:]
326
+ temp_tones = temp_tones[3:]
327
+ temp_lang_ids = temp_lang_ids[3:]
328
+ if skip_end:
329
+ temp_bert = temp_bert[:, :-2]
330
+ temp_ja_bert = temp_ja_bert[:, :-2]
331
+ temp_en_bert = temp_en_bert[:, :-2]
332
+ temp_phones = temp_phones[:-2]
333
+ temp_tones = temp_tones[:-2]
334
+ temp_lang_ids = temp_lang_ids[:-2]
335
+ bert.append(temp_bert)
336
+ ja_bert.append(temp_ja_bert)
337
+ en_bert.append(temp_en_bert)
338
+ phones.append(temp_phones)
339
+ tones.append(temp_tones)
340
+ lang_ids.append(temp_lang_ids)
341
+ bert = torch.concatenate(bert, dim=1)
342
+ ja_bert = torch.concatenate(ja_bert, dim=1)
343
+ en_bert = torch.concatenate(en_bert, dim=1)
344
+ phones = torch.concatenate(phones, dim=0)
345
+ tones = torch.concatenate(tones, dim=0)
346
+ lang_ids = torch.concatenate(lang_ids, dim=0)
347
  with torch.no_grad():
348
  x_tst = phones.to(device).unsqueeze(0)
349
  tones = tones.to(device).unsqueeze(0)
 
351
  bert = bert.to(device).unsqueeze(0)
352
  ja_bert = ja_bert.to(device).unsqueeze(0)
353
  en_bert = en_bert.to(device).unsqueeze(0)
354
+ emo = emo.to(device).unsqueeze(0)
355
  x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
356
  del phones
357
  speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
 
365
  bert,
366
  ja_bert,
367
  en_bert,
368
+ emo,
369
  sdp_ratio=sdp_ratio,
370
  noise_scale=noise_scale,
371
  noise_scale_w=noise_scale_w,
 
375
  .float()
376
  .numpy()
377
  )
378
+ del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers, ja_bert, en_bert, emo
379
  if torch.cuda.is_available():
380
  torch.cuda.empty_cache()
381
  return audio
models.py CHANGED
@@ -10,9 +10,12 @@ import monotonic_align
10
 
11
  from torch.nn import Conv1d, ConvTranspose1d, Conv2d
12
  from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
 
13
  from commons import init_weights, get_padding
14
  from text import symbols, num_tones, num_languages
15
 
 
 
16
 
17
  class DurationDiscriminator(nn.Module): # vits2
18
  def __init__(
@@ -309,6 +312,37 @@ class DurationPredictor(nn.Module):
309
  return x * x_mask
310
 
311
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
  class TextEncoder(nn.Module):
313
  def __init__(
314
  self,
@@ -320,6 +354,7 @@ class TextEncoder(nn.Module):
320
  n_layers,
321
  kernel_size,
322
  p_dropout,
 
323
  gin_channels=0,
324
  ):
325
  super().__init__()
@@ -341,6 +376,31 @@ class TextEncoder(nn.Module):
341
  self.bert_proj = nn.Conv1d(1024, hidden_channels, 1)
342
  self.ja_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
343
  self.en_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
344
 
345
  self.encoder = attentions.Encoder(
346
  hidden_channels,
@@ -354,11 +414,17 @@ class TextEncoder(nn.Module):
354
  self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
355
 
356
  def forward(
357
- self, x, x_lengths, tone, language, bert, ja_bert, en_bert, sid, g=None
358
  ):
 
359
  bert_emb = self.bert_proj(bert).transpose(1, 2)
360
  ja_bert_emb = self.ja_bert_proj(ja_bert).transpose(1, 2)
361
  en_bert_emb = self.en_bert_proj(en_bert).transpose(1, 2)
 
 
 
 
 
362
  x = (
363
  self.emb(x)
364
  + self.tone_emb(tone)
@@ -366,6 +432,7 @@ class TextEncoder(nn.Module):
366
  + bert_emb
367
  + ja_bert_emb
368
  + en_bert_emb
 
369
  ) * math.sqrt(
370
  self.hidden_channels
371
  ) # [b, t, h]
@@ -378,7 +445,7 @@ class TextEncoder(nn.Module):
378
  stats = self.proj(x) * x_mask
379
 
380
  m, logs = torch.split(stats, self.out_channels, dim=1)
381
- return x, m, logs, x_mask
382
 
383
 
384
  class ResidualCouplingBlock(nn.Module):
@@ -811,6 +878,7 @@ class SynthesizerTrn(nn.Module):
811
  n_layers,
812
  kernel_size,
813
  p_dropout,
 
814
  gin_channels=self.enc_gin_channels,
815
  )
816
  self.dec = Generator(
@@ -884,8 +952,8 @@ class SynthesizerTrn(nn.Module):
884
  g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
885
  else:
886
  g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
887
- x, m_p, logs_p, x_mask = self.enc_p(
888
- x, x_lengths, tone, language, bert, ja_bert, en_bert, sid, g=g
889
  )
890
  z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
891
  z_p = self.flow(z, y_mask, g=g)
@@ -951,6 +1019,8 @@ class SynthesizerTrn(nn.Module):
951
  y_mask,
952
  (z, z_p, m_p, logs_p, m_q, logs_q),
953
  (x, logw, logw_),
 
 
954
  )
955
 
956
  def infer(
@@ -963,6 +1033,7 @@ class SynthesizerTrn(nn.Module):
963
  bert,
964
  ja_bert,
965
  en_bert,
 
966
  noise_scale=0.667,
967
  length_scale=1,
968
  noise_scale_w=0.8,
@@ -976,8 +1047,8 @@ class SynthesizerTrn(nn.Module):
976
  g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
977
  else:
978
  g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
979
- x, m_p, logs_p, x_mask = self.enc_p(
980
- x, x_lengths, tone, language, bert, ja_bert, en_bert, sid, g=g
981
  )
982
  logw = self.sdp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) * (
983
  sdp_ratio
 
10
 
11
  from torch.nn import Conv1d, ConvTranspose1d, Conv2d
12
  from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
13
+
14
  from commons import init_weights, get_padding
15
  from text import symbols, num_tones, num_languages
16
 
17
+ from vector_quantize_pytorch import VectorQuantize
18
+
19
 
20
  class DurationDiscriminator(nn.Module): # vits2
21
  def __init__(
 
312
  return x * x_mask
313
 
314
 
315
+ class Bottleneck(nn.Sequential):
316
+ def __init__(self, in_dim, hidden_dim):
317
+ c_fc1 = nn.Linear(in_dim, hidden_dim, bias=False)
318
+ c_fc2 = nn.Linear(in_dim, hidden_dim, bias=False)
319
+ super().__init__(*[c_fc1, c_fc2])
320
+
321
+
322
+ class Block(nn.Module):
323
+ def __init__(self, in_dim, hidden_dim) -> None:
324
+ super().__init__()
325
+ self.norm = nn.LayerNorm(in_dim)
326
+ self.mlp = MLP(in_dim, hidden_dim)
327
+
328
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
329
+ x = x + self.mlp(self.norm(x))
330
+ return x
331
+
332
+
333
+ class MLP(nn.Module):
334
+ def __init__(self, in_dim, hidden_dim):
335
+ super().__init__()
336
+ self.c_fc1 = nn.Linear(in_dim, hidden_dim, bias=False)
337
+ self.c_fc2 = nn.Linear(in_dim, hidden_dim, bias=False)
338
+ self.c_proj = nn.Linear(hidden_dim, in_dim, bias=False)
339
+
340
+ def forward(self, x: torch.Tensor):
341
+ x = F.silu(self.c_fc1(x)) * self.c_fc2(x)
342
+ x = self.c_proj(x)
343
+ return x
344
+
345
+
346
  class TextEncoder(nn.Module):
347
  def __init__(
348
  self,
 
354
  n_layers,
355
  kernel_size,
356
  p_dropout,
357
+ n_speakers,
358
  gin_channels=0,
359
  ):
360
  super().__init__()
 
376
  self.bert_proj = nn.Conv1d(1024, hidden_channels, 1)
377
  self.ja_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
378
  self.en_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
379
+ # self.emo_proj = nn.Linear(512, hidden_channels)
380
+ self.in_feature_net = nn.Sequential(
381
+ # input is assumed to an already normalized embedding
382
+ nn.Linear(512, 1028, bias=False),
383
+ nn.GELU(),
384
+ nn.LayerNorm(1028),
385
+ *[Block(1028, 512) for _ in range(1)],
386
+ nn.Linear(1028, 512, bias=False),
387
+ # normalize before passing to VQ?
388
+ # nn.GELU(),
389
+ # nn.LayerNorm(512),
390
+ )
391
+ self.emo_vq = VectorQuantize(
392
+ dim=512,
393
+ codebook_size=64,
394
+ codebook_dim=32,
395
+ commitment_weight=0.1,
396
+ decay=0.85,
397
+ heads=32,
398
+ kmeans_iters=20,
399
+ separate_codebook_per_head=True,
400
+ stochastic_sample_codes=True,
401
+ threshold_ema_dead_code=2,
402
+ )
403
+ self.out_feature_net = nn.Linear(512, hidden_channels)
404
 
405
  self.encoder = attentions.Encoder(
406
  hidden_channels,
 
414
  self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
415
 
416
  def forward(
417
+ self, x, x_lengths, tone, language, bert, ja_bert, en_bert, emo, sid, g=None
418
  ):
419
+ sid = sid.cpu()
420
  bert_emb = self.bert_proj(bert).transpose(1, 2)
421
  ja_bert_emb = self.ja_bert_proj(ja_bert).transpose(1, 2)
422
  en_bert_emb = self.en_bert_proj(en_bert).transpose(1, 2)
423
+ emo_emb = self.in_feature_net(emo)
424
+ emo_emb, _, loss_commit = self.emo_vq(emo_emb.unsqueeze(1))
425
+ loss_commit = loss_commit.mean()
426
+ emo_emb = self.out_feature_net(emo_emb)
427
+ # emo_emb = self.emo_proj(emo.unsqueeze(1))
428
  x = (
429
  self.emb(x)
430
  + self.tone_emb(tone)
 
432
  + bert_emb
433
  + ja_bert_emb
434
  + en_bert_emb
435
+ + emo_emb
436
  ) * math.sqrt(
437
  self.hidden_channels
438
  ) # [b, t, h]
 
445
  stats = self.proj(x) * x_mask
446
 
447
  m, logs = torch.split(stats, self.out_channels, dim=1)
448
+ return x, m, logs, x_mask, loss_commit
449
 
450
 
451
  class ResidualCouplingBlock(nn.Module):
 
878
  n_layers,
879
  kernel_size,
880
  p_dropout,
881
+ self.n_speakers,
882
  gin_channels=self.enc_gin_channels,
883
  )
884
  self.dec = Generator(
 
952
  g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
953
  else:
954
  g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
955
+ x, m_p, logs_p, x_mask, loss_commit = self.enc_p(
956
+ x, x_lengths, tone, language, bert, ja_bert, en_bert, emo, sid, g=g
957
  )
958
  z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
959
  z_p = self.flow(z, y_mask, g=g)
 
1019
  y_mask,
1020
  (z, z_p, m_p, logs_p, m_q, logs_q),
1021
  (x, logw, logw_),
1022
+ g,
1023
+ loss_commit,
1024
  )
1025
 
1026
  def infer(
 
1033
  bert,
1034
  ja_bert,
1035
  en_bert,
1036
+ emo=None,
1037
  noise_scale=0.667,
1038
  length_scale=1,
1039
  noise_scale_w=0.8,
 
1047
  g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
1048
  else:
1049
  g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
1050
+ x, m_p, logs_p, x_mask, _ = self.enc_p(
1051
+ x, x_lengths, tone, language, bert, ja_bert, en_bert, emo, sid, g=g
1052
  )
1053
  logw = self.sdp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) * (
1054
  sdp_ratio
monotonic_align/__pycache__/__init__.cpython-311.pyc CHANGED
Binary files a/monotonic_align/__pycache__/__init__.cpython-311.pyc and b/monotonic_align/__pycache__/__init__.cpython-311.pyc differ
 
monotonic_align/__pycache__/core.cpython-311.pyc CHANGED
Binary files a/monotonic_align/__pycache__/core.cpython-311.pyc and b/monotonic_align/__pycache__/core.cpython-311.pyc differ
 
onnx_modules/V200/__init__.py ADDED
File without changes
onnx_modules/V200/attentions_onnx.py ADDED
@@ -0,0 +1,378 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import functional as F
5
+
6
+ import commons
7
+ import logging
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class LayerNorm(nn.Module):
    """Layer normalisation over dimension 1 of the input.

    Args:
        channels: size of dimension 1, i.e. the dimension being normalised.
        eps: numerical-stability term forwarded to ``F.layer_norm``.
    """

    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.channels, self.eps = channels, eps

        # Learnable scale (initialised to 1) and shift (initialised to 0).
        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        """Normalise ``x`` along dimension 1 and return a tensor of the same shape."""
        # F.layer_norm works on the trailing dimension, so dimension 1 is
        # swapped to the end for the call and swapped back afterwards.
        swapped = x.transpose(1, -1)
        normed = F.layer_norm(
            swapped,
            (self.channels,),
            weight=self.gamma,
            bias=self.beta,
            eps=self.eps,
        )
        return normed.transpose(1, -1)
25
+
26
+
27
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    """Gated activation: tanh of the first channels times sigmoid of the rest.

    The two inputs are summed, the sum is cut along dimension 1 at
    ``n_channels[0]``, and ``tanh(first part) * sigmoid(second part)`` is
    returned.
    """
    split_at = n_channels[0]
    summed = input_a + input_b
    tanh_part = torch.tanh(summed[:, :split_at, :])
    sigmoid_part = torch.sigmoid(summed[:, split_at:, :])
    return tanh_part * sigmoid_part
35
+
36
+
37
class Encoder(nn.Module):
    """Stack of self-attention + FFN layers, each followed by a LayerNorm.

    When a non-zero ``gin_channels`` is passed through ``kwargs``, a linear
    projection of the conditioning input ``g`` is added to the hidden states
    immediately before layer ``cond_layer_idx`` (default 2).

    Args:
        hidden_channels: channel width of the input, the output and every layer.
        filter_channels: inner width of each FFN.
        n_heads: number of attention heads.
        n_layers: number of attention/FFN layer pairs.
        kernel_size: convolution kernel size used by each FFN.
        p_dropout: dropout probability for the attention and FFN outputs.
        window_size: relative-attention window handed to MultiHeadAttention.
        isflow: accepted for signature compatibility; not used by this class.
        **kwargs: optional ``gin_channels`` and ``cond_layer_idx``.

    Raises:
        AssertionError: if conditioning is enabled and ``cond_layer_idx`` is
            not smaller than ``n_layers``.
    """

    def __init__(
        self,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size=1,
        p_dropout=0.0,
        window_size=4,
        isflow=True,
        **kwargs
    ):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.window_size = window_size

        # An index equal to n_layers is never reached by the loop in forward(),
        # so conditioning stays disabled unless it is configured below.
        self.cond_layer_idx = self.n_layers
        if "gin_channels" in kwargs:
            self.gin_channels = kwargs["gin_channels"]
            if self.gin_channels != 0:
                self.spk_emb_linear = nn.Linear(self.gin_channels, self.hidden_channels)
                # vits2 says 3rd block, so idx is 2 by default
                self.cond_layer_idx = kwargs.get("cond_layer_idx", 2)
                # The first argument of debug() is a %-format string; passing
                # the raw values positionally breaks record formatting.
                # getLogger(__name__) is the same object as the module logger.
                logging.getLogger(__name__).debug(
                    "gin_channels=%s, cond_layer_idx=%s",
                    self.gin_channels,
                    self.cond_layer_idx,
                )
                assert (
                    self.cond_layer_idx < self.n_layers
                ), "cond_layer_idx should be less than n_layers"

        self.drop = nn.Dropout(p_dropout)
        self.attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        for _ in range(self.n_layers):
            self.attn_layers.append(
                MultiHeadAttention(
                    hidden_channels,
                    hidden_channels,
                    n_heads,
                    p_dropout=p_dropout,
                    window_size=window_size,
                )
            )
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            self.ffn_layers.append(
                FFN(
                    hidden_channels,
                    hidden_channels,
                    filter_channels,
                    kernel_size,
                    p_dropout=p_dropout,
                )
            )
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask, g=None):
        """Encode ``x`` under ``x_mask``; ``g`` is the optional conditioning input.

        Returns the masked hidden states after the last layer.
        """
        # Pairwise mask: the outer product of the mask with itself.
        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        layers = zip(
            self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2
        )
        for i, (attn, norm_1, ffn, norm_2) in enumerate(layers):
            if i == self.cond_layer_idx and g is not None:
                # nn.Linear acts on the last dimension, hence the transposes.
                g = self.spk_emb_linear(g.transpose(1, 2))
                g = g.transpose(1, 2)
                x = x + g
                x = x * x_mask
            y = attn(x, x, attn_mask)
            y = self.drop(y)
            x = norm_1(x + y)

            y = ffn(x, x_mask)
            y = self.drop(y)
            x = norm_2(x + y)
        x = x * x_mask
        return x
121
+
122
+
123
class MultiHeadAttention(nn.Module):
    """Multi-head attention on ``[b, d, t]`` tensors.

    Optional extras, all off by default: windowed relative-position
    embeddings (``window_size``), a proximity bias (``proximal_bias``) and a
    band mask (``block_length``). The last attention map is kept in
    ``self.attn``.

    Args:
        channels: input width; must be divisible by ``n_heads``.
        out_channels: width produced by the output projection.
        n_heads: number of attention heads.
        p_dropout: dropout probability applied to the attention weights.
        window_size: relative-position window, or ``None`` to disable.
        heads_share: share one relative embedding table across all heads.
        block_length: band half-width for local attention, or ``None``.
        proximal_bias: add a bias that favours nearby positions.
        proximal_init: initialise the key projection from the query projection.
    """

    def __init__(
        self,
        channels,
        out_channels,
        n_heads,
        p_dropout=0.0,
        window_size=None,
        heads_share=True,
        block_length=None,
        proximal_bias=False,
        proximal_init=False,
    ):
        super().__init__()
        assert channels % n_heads == 0

        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.p_dropout = p_dropout
        self.window_size = window_size
        self.heads_share = heads_share
        self.block_length = block_length
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init
        self.attn = None  # overwritten with the attention map on every forward()

        self.k_channels = channels // n_heads
        self.conv_q = nn.Conv1d(channels, channels, 1)
        self.conv_k = nn.Conv1d(channels, channels, 1)
        self.conv_v = nn.Conv1d(channels, channels, 1)
        self.conv_o = nn.Conv1d(channels, out_channels, 1)
        self.drop = nn.Dropout(p_dropout)

        if window_size is not None:
            rel_shape = (
                1 if heads_share else n_heads,
                window_size * 2 + 1,
                self.k_channels,
            )
            rel_stddev = self.k_channels**-0.5
            self.emb_rel_k = nn.Parameter(torch.randn(*rel_shape) * rel_stddev)
            self.emb_rel_v = nn.Parameter(torch.randn(*rel_shape) * rel_stddev)

        for projection in (self.conv_q, self.conv_k, self.conv_v):
            nn.init.xavier_uniform_(projection.weight)
        if proximal_init:
            with torch.no_grad():
                self.conv_k.weight.copy_(self.conv_q.weight)
                self.conv_k.bias.copy_(self.conv_q.bias)

    def forward(self, x, c, attn_mask=None):
        """Attend from ``x`` (queries) to ``c`` (keys and values)."""
        q, k, v = self.conv_q(x), self.conv_k(c), self.conv_v(c)
        attended, self.attn = self.attention(q, k, v, mask=attn_mask)
        return self.conv_o(attended)

    def attention(self, query, key, value, mask=None):
        """Return ``(output [b, d, t_t], attention weights [b, n_h, t_t, t_s])``."""
        b, d, t_s = key.size()
        t_t = query.size(2)
        n_h, d_k = self.n_heads, self.k_channels

        # reshape [b, d, t] -> [b, n_h, t, d_k]
        query = query.view(b, n_h, d_k, t_t).transpose(2, 3)
        key = key.view(b, n_h, d_k, t_s).transpose(2, 3)
        value = value.view(b, n_h, d_k, t_s).transpose(2, 3)

        scaled_query = query / math.sqrt(d_k)
        scores = torch.matmul(scaled_query, key.transpose(-2, -1))
        use_relative = self.window_size is not None
        if use_relative:
            assert (
                t_s == t_t
            ), "Relative attention is only available for self-attention."
            rel_keys = self._get_relative_embeddings(self.emb_rel_k, t_s)
            rel_logits = self._matmul_with_relative_keys(scaled_query, rel_keys)
            scores = scores + self._relative_position_to_absolute_position(rel_logits)
        if self.proximal_bias:
            assert t_s == t_t, "Proximal bias is only available for self-attention."
            bias = self._attention_bias_proximal(t_s)
            scores = scores + bias.to(device=scores.device, dtype=scores.dtype)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e4)
            if self.block_length is not None:
                assert (
                    t_s == t_t
                ), "Local attention is only available for self-attention."
                band = torch.ones_like(scores)
                band = band.triu(-self.block_length).tril(self.block_length)
                scores = scores.masked_fill(band == 0, -1e4)
        p_attn = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
        p_attn = self.drop(p_attn)
        output = torch.matmul(p_attn, value)
        if use_relative:
            rel_weights = self._absolute_position_to_relative_position(p_attn)
            rel_values = self._get_relative_embeddings(self.emb_rel_v, t_s)
            output = output + self._matmul_with_relative_values(rel_weights, rel_values)
        # [b, n_h, t_t, d_k] -> [b, d, t_t]
        output = output.transpose(2, 3).contiguous().view(b, d, t_t)
        return output, p_attn

    def _matmul_with_relative_values(self, x, y):
        """
        x: [b, h, l, m]
        y: [h or 1, m, d]
        ret: [b, h, l, d]
        """
        return torch.matmul(x, y.unsqueeze(0))

    def _matmul_with_relative_keys(self, x, y):
        """
        x: [b, h, l, d]
        y: [h or 1, m, d]
        ret: [b, h, l, m]
        """
        return torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))

    def _get_relative_embeddings(self, relative_embeddings, length):
        """Pad and/or slice the embedding table to ``2 * length - 1`` entries."""
        # Pad first before slice to avoid using cond ops.
        pad_length = max(length - (self.window_size + 1), 0)
        first = max((self.window_size + 1) - length, 0)
        last = first + 2 * length - 1
        if pad_length > 0:
            table = F.pad(
                relative_embeddings,
                commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
            )
        else:
            table = relative_embeddings
        return table[:, first:last]

    def _relative_position_to_absolute_position(self, x):
        """
        x: [b, h, l, 2*l-1]
        ret: [b, h, l, l]
        """
        batch, heads, length, _ = x.size()
        # Concat columns of pad to shift from relative to absolute indexing.
        padded = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))

        # Concat extra elements so to add up to shape (len+1, 2*len-1).
        flat = padded.view([batch, heads, length * 2 * length])
        flat = F.pad(
            flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
        )

        # Reshape and slice out the padded elements.
        skewed = flat.view([batch, heads, length + 1, 2 * length - 1])
        return skewed[:, :, :length, length - 1 :]

    def _absolute_position_to_relative_position(self, x):
        """
        x: [b, h, l, l]
        ret: [b, h, l, 2*l-1]
        """
        batch, heads, length, _ = x.size()
        # pad along column
        padded = F.pad(
            x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
        )
        flat = padded.view([batch, heads, length**2 + length * (length - 1)])
        # add 0's in the beginning that will skew the elements after reshape
        flat = F.pad(flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
        return flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]

    def _attention_bias_proximal(self, length):
        """Bias for self-attention to encourage attention to close positions.
        Args:
          length: an integer scalar.
        Returns:
          a Tensor with shape [1, 1, length, length]
        """
        positions = torch.arange(length, dtype=torch.float32)
        diff = torch.unsqueeze(positions, 0) - torch.unsqueeze(positions, 1)
        bias = -torch.log1p(torch.abs(diff))
        return bias.unsqueeze(0).unsqueeze(0)
321
+
322
+
323
class FFN(nn.Module):
    """Two-convolution feed-forward block with masking around each convolution.

    Args:
        in_channels: input channel count.
        out_channels: output channel count.
        filter_channels: channel count between the two convolutions.
        kernel_size: kernel size of both convolutions.
        p_dropout: dropout probability applied after the activation.
        activation: ``"gelu"`` selects ``x * sigmoid(1.702 * x)``; anything
            else selects ReLU.
        causal: pad only on the left instead of on both sides.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        filter_channels,
        kernel_size,
        p_dropout=0.0,
        activation=None,
        causal=False,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.activation = activation
        self.causal = causal

        # The padding strategy is chosen once and stored as a bound method.
        self.padding = self._causal_padding if causal else self._same_padding

        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        """Apply conv -> activation -> dropout -> conv, masking at every step."""
        hidden = self.conv_1(self.padding(x * x_mask))
        if self.activation == "gelu":
            hidden = hidden * torch.sigmoid(1.702 * hidden)
        else:
            hidden = torch.relu(hidden)
        hidden = self.drop(hidden)
        projected = self.conv_2(self.padding(hidden * x_mask))
        return projected * x_mask

    def _causal_padding(self, x):
        """Left-pad the last dimension by ``kernel_size - 1`` (no-op for size 1)."""
        if self.kernel_size == 1:
            return x
        left, right = self.kernel_size - 1, 0
        pad_spec = [[0, 0], [0, 0], [left, right]]
        return F.pad(x, commons.convert_pad_shape(pad_spec))

    def _same_padding(self, x):
        """Pad both ends of the last dimension (no-op for kernel size 1)."""
        if self.kernel_size == 1:
            return x
        left, right = (self.kernel_size - 1) // 2, self.kernel_size // 2
        pad_spec = [[0, 0], [0, 0], [left, right]]
        return F.pad(x, commons.convert_pad_shape(pad_spec))
onnx_modules/V200/models_onnx.py ADDED
@@ -0,0 +1,990 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import functional as F
5
+
6
+ import commons
7
+ import modules
8
+ from . import attentions_onnx
9
+
10
+ from torch.nn import Conv1d, ConvTranspose1d, Conv2d
11
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
+ from commons import init_weights, get_padding
13
+ from .text import symbols, num_tones, num_languages
14
+
15
+
16
class DurationDiscriminator(nn.Module):  # vits2
    """Discriminator that scores a duration sequence against encoder features.

    ``forward`` returns a two-element list: the score for ``dur_r`` followed
    by the score for ``dur_hat``. Scores pass through a sigmoid.

    Args:
        in_channels: channel count of the feature input ``x``.
        filter_channels: channel count of the internal convolutions.
        kernel_size: kernel size of the internal convolutions.
        p_dropout: dropout probability.
        gin_channels: channel count of the conditioning input ``g``; 0 means
            no conditioning projection is created.
    """

    def __init__(
        self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
    ):
        super().__init__()

        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.gin_channels = gin_channels

        half_kernel = kernel_size // 2

        self.drop = nn.Dropout(p_dropout)
        self.conv_1 = nn.Conv1d(
            in_channels, filter_channels, kernel_size, padding=half_kernel
        )
        self.norm_1 = modules.LayerNorm(filter_channels)
        self.conv_2 = nn.Conv1d(
            filter_channels, filter_channels, kernel_size, padding=half_kernel
        )
        self.norm_2 = modules.LayerNorm(filter_channels)
        self.dur_proj = nn.Conv1d(1, filter_channels, 1)

        # Input is the features concatenated with the projected durations,
        # hence twice filter_channels.
        self.pre_out_conv_1 = nn.Conv1d(
            2 * filter_channels, filter_channels, kernel_size, padding=half_kernel
        )
        self.pre_out_norm_1 = modules.LayerNorm(filter_channels)
        self.pre_out_conv_2 = nn.Conv1d(
            filter_channels, filter_channels, kernel_size, padding=half_kernel
        )
        self.pre_out_norm_2 = modules.LayerNorm(filter_channels)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, in_channels, 1)

        self.output_layer = nn.Sequential(nn.Linear(filter_channels, 1), nn.Sigmoid())

    def forward_probability(self, x, x_mask, dur, g=None):
        """Score one duration tensor ``dur`` against the features ``x``.

        ``g`` is accepted but not used here.
        """
        x = torch.cat([x, self.dur_proj(dur)], dim=1)
        stages = (
            (self.pre_out_conv_1, self.pre_out_norm_1),
            (self.pre_out_conv_2, self.pre_out_norm_2),
        )
        for conv, norm in stages:
            x = self.drop(norm(torch.relu(conv(x * x_mask))))
        # nn.Linear acts on the last dimension, so channels are moved there.
        x = (x * x_mask).transpose(1, 2)
        return self.output_layer(x)

    def forward(self, x, x_mask, dur_r, dur_hat, g=None):
        """Return ``[score(dur_r), score(dur_hat)]``; ``x`` and ``g`` are detached."""
        x = torch.detach(x)
        if g is not None:
            g = torch.detach(g)
            x = x + self.cond(g)
        for conv, norm in ((self.conv_1, self.norm_1), (self.conv_2, self.norm_2)):
            x = self.drop(norm(torch.relu(conv(x * x_mask))))

        return [self.forward_probability(x, x_mask, dur, g) for dur in (dur_r, dur_hat)]
89
+
90
+
91
class TransformerCouplingBlock(nn.Module):
    """Flow made of ``n_flows`` transformer coupling layers, each followed by a Flip.

    Args:
        channels: channel count of the tensor being transformed.
        hidden_channels, filter_channels, n_heads, n_layers, kernel_size,
            p_dropout: forwarded to every coupling layer.
        n_flows: number of (coupling layer, Flip) pairs.
        gin_channels: channel count of the conditioning input.
        share_parameter: build one shared network and pass it to every
            coupling layer as ``wn_sharing_parameter``.
    """

    def __init__(
        self,
        channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        n_flows=4,
        gin_channels=0,
        share_parameter=False,
    ):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()

        # NOTE(review): no FFT class is defined in the attentions_onnx.py added
        # alongside this file, so share_parameter=True looks like it would fail
        # with AttributeError -- confirm before relying on it.
        if share_parameter:
            self.wn = attentions_onnx.FFT(
                hidden_channels,
                filter_channels,
                n_heads,
                n_layers,
                kernel_size,
                p_dropout,
                isflow=True,
                gin_channels=self.gin_channels,
            )
        else:
            self.wn = None

        for _ in range(n_flows):
            coupling = modules.TransformerCouplingLayer(
                channels,
                hidden_channels,
                kernel_size,
                n_layers,
                n_heads,
                p_dropout,
                filter_channels,
                mean_only=True,
                wn_sharing_parameter=self.wn,
                gin_channels=self.gin_channels,
            )
            self.flows.extend([coupling, modules.Flip()])

    def forward(self, x, x_mask, g=None, reverse=True):
        """Run the flows forward, or in reverse order when ``reverse`` is true."""
        if reverse:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
            return x
        for flow in self.flows:
            # In the forward direction each flow returns a pair; only the
            # first element is kept.
            x, _ = flow(x, x_mask, g=g, reverse=reverse)
        return x
155
+
156
+
157
class StochasticDurationPredictor(nn.Module):
    """Flow-based duration predictor; this variant only runs the flows in reverse.

    Args:
        in_channels: channel count of the feature input.
        filter_channels: ignored -- it is overwritten with ``in_channels``.
        kernel_size: kernel size for the ConvFlow and DDSConv sub-modules.
        p_dropout: dropout probability for the DDSConv sub-modules.
        n_flows: number of (ConvFlow, Flip) pairs in the main flow stack.
        gin_channels: channel count of the conditioning input; 0 means no
            conditioning projection is created.
    """

    def __init__(
        self,
        in_channels,
        filter_channels,
        kernel_size,
        p_dropout,
        n_flows=4,
        gin_channels=0,
    ):
        super().__init__()
        filter_channels = in_channels  # it needs to be removed from future version.
        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.log_flow = modules.Log()
        self.flows = nn.ModuleList()
        self.flows.append(modules.ElementwiseAffine(2))
        for _ in range(n_flows):
            self.flows.append(
                modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
            )
            self.flows.append(modules.Flip())

        # The post_* modules are created but not called by forward() below.
        self.post_pre = nn.Conv1d(1, filter_channels, 1)
        self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
        self.post_convs = modules.DDSConv(
            filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
        )
        self.post_flows = nn.ModuleList()
        self.post_flows.append(modules.ElementwiseAffine(2))
        for _ in range(4):
            self.post_flows.append(
                modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
            )
            self.post_flows.append(modules.Flip())

        self.pre = nn.Conv1d(in_channels, filter_channels, 1)
        self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
        self.convs = modules.DDSConv(
            filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
        )
        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, filter_channels, 1)

    def forward(self, x, x_mask, z, g=None):
        """Map ``z`` through the reversed flows and return ``logw``.

        ``x`` and ``g`` are detached, so no gradient flows back through them.
        """
        x = self.pre(torch.detach(x))
        if g is not None:
            x = x + self.cond(torch.detach(g))
        x = self.convs(x, x_mask)
        x = self.proj(x) * x_mask

        reversed_flows = list(reversed(self.flows))
        # remove a useless vflow
        reversed_flows = reversed_flows[:-2] + [reversed_flows[-1]]
        for flow in reversed_flows:
            z = flow(z, x_mask, g=x, reverse=True)
        # Only the first of the two channels is returned.
        logw, _ = torch.split(z, [1, 1], 1)
        return logw
222
+
223
+
224
class DurationPredictor(nn.Module):
    """Convolutional duration predictor producing one output channel.

    Args:
        in_channels: channel count of the feature input ``x``.
        filter_channels: channel count of the two hidden convolutions.
        kernel_size: kernel size of the two hidden convolutions.
        p_dropout: dropout probability.
        gin_channels: channel count of the conditioning input ``g``; 0 means
            no conditioning projection is created.
    """

    def __init__(
        self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
    ):
        super().__init__()

        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.gin_channels = gin_channels

        half_kernel = kernel_size // 2

        self.drop = nn.Dropout(p_dropout)
        self.conv_1 = nn.Conv1d(
            in_channels, filter_channels, kernel_size, padding=half_kernel
        )
        self.norm_1 = modules.LayerNorm(filter_channels)
        self.conv_2 = nn.Conv1d(
            filter_channels, filter_channels, kernel_size, padding=half_kernel
        )
        self.norm_2 = modules.LayerNorm(filter_channels)
        self.proj = nn.Conv1d(filter_channels, 1, 1)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, in_channels, 1)

    def forward(self, x, x_mask, g=None):
        """Return the masked single-channel prediction; ``x`` and ``g`` are detached."""
        x = torch.detach(x)
        if g is not None:
            g = torch.detach(g)
            x = x + self.cond(g)
        for conv, norm in ((self.conv_1, self.norm_1), (self.conv_2, self.norm_2)):
            x = self.drop(norm(torch.relu(conv(x * x_mask))))
        return self.proj(x * x_mask) * x_mask
265
+
266
+
267
class TextEncoder(nn.Module):
    """Text encoder for ONNX export.

    Sums symbol, tone and language embeddings with three projected BERT
    feature streams, runs the result through the attention encoder and
    projects it to a mean / log-scale pair.

    Args:
        n_vocab: stored on the instance; the symbol embedding is sized from
            ``len(symbols)`` instead.
        out_channels: channel count of each of ``m`` and ``logs``.
        hidden_channels: embedding and encoder width.
        filter_channels, n_heads, n_layers, kernel_size, p_dropout:
            forwarded to the attention encoder.
        gin_channels: conditioning width forwarded to the attention encoder.
    """

    def __init__(
        self,
        n_vocab,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        gin_channels=0,
    ):
        super().__init__()
        self.n_vocab = n_vocab
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.gin_channels = gin_channels

        emb_std = hidden_channels**-0.5
        self.emb = nn.Embedding(len(symbols), hidden_channels)
        nn.init.normal_(self.emb.weight, 0.0, emb_std)
        self.tone_emb = nn.Embedding(num_tones, hidden_channels)
        nn.init.normal_(self.tone_emb.weight, 0.0, emb_std)
        self.language_emb = nn.Embedding(num_languages, hidden_channels)
        nn.init.normal_(self.language_emb.weight, 0.0, emb_std)
        # Each BERT stream is 1024 channels wide going in.
        self.bert_proj = nn.Conv1d(1024, hidden_channels, 1)
        self.ja_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
        self.en_bert_proj = nn.Conv1d(1024, hidden_channels, 1)

        self.encoder = attentions_onnx.Encoder(
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
            gin_channels=self.gin_channels,
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, tone, language, bert, ja_bert, en_bert, g=None):
        """Return ``(x, m, logs, x_mask)``.

        ``x_lengths`` is accepted but unused: the mask is all ones, built from
        the shape of ``x``.
        """
        x_mask = torch.ones_like(x).unsqueeze(0)

        def project(conv, features):
            # 2-D features get a leading batch dimension for the conv; the
            # result is transposed so it can be added to the embeddings.
            return conv(features.transpose(0, 1).unsqueeze(0)).transpose(1, 2)

        bert_emb = project(self.bert_proj, bert)
        ja_bert_emb = project(self.ja_bert_proj, ja_bert)
        en_bert_emb = project(self.en_bert_proj, en_bert)

        summed = (
            self.emb(x)
            + self.tone_emb(tone)
            + self.language_emb(language)
            + bert_emb
            + ja_bert_emb
            + en_bert_emb
        )
        x = summed * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = x_mask.to(x.dtype)

        x = self.encoder(x * x_mask, x_mask, g=g)
        stats = self.proj(x) * x_mask

        # The projection has 2 * out_channels channels: mean then log-scale.
        m, logs = torch.split(stats, self.out_channels, dim=1)
        return x, m, logs, x_mask
338
+
339
+
340
class ResidualCouplingBlock(nn.Module):
    """Flow made of ``n_flows`` residual coupling layers, each followed by a Flip.

    Args:
        channels: channel count of the tensor being transformed.
        hidden_channels, kernel_size, dilation_rate, n_layers: forwarded to
            every coupling layer.
        n_flows: number of (coupling layer, Flip) pairs.
        gin_channels: channel count of the conditioning input.
    """

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        n_flows=4,
        gin_channels=0,
    ):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            coupling = modules.ResidualCouplingLayer(
                channels,
                hidden_channels,
                kernel_size,
                dilation_rate,
                n_layers,
                gin_channels=gin_channels,
                mean_only=True,
            )
            self.flows.extend([coupling, modules.Flip()])

    def forward(self, x, x_mask, g=None, reverse=True):
        """Run the flows forward, or in reverse order when ``reverse`` is true."""
        if reverse:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
            return x
        for flow in self.flows:
            # In the forward direction each flow returns a pair; only the
            # first element is kept.
            x, _ = flow(x, x_mask, g=g, reverse=reverse)
        return x
383
+
384
+
385
class PosteriorEncoder(nn.Module):
    """Encode an input sequence into a sampled latent plus its mean and log-scale.

    Args:
        in_channels: channel count of the input.
        out_channels: channel count of the latent ``z`` (and of ``m``, ``logs``).
        hidden_channels: width of the WN stack.
        kernel_size, dilation_rate, n_layers: forwarded to ``modules.WN``.
        gin_channels: channel count of the conditioning input.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            gin_channels=gin_channels,
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        """Return ``(z, m, logs, x_mask)`` where ``z = m + noise * exp(logs)``, masked."""
        length_mask = commons.sequence_mask(x_lengths, x.size(2))
        x_mask = torch.unsqueeze(length_mask, 1).to(x.dtype)

        hidden = self.pre(x) * x_mask
        hidden = self.enc(hidden, x_mask, g=g)
        stats = self.proj(hidden) * x_mask

        # The projection has 2 * out_channels channels: mean then log-scale.
        m, logs = torch.split(stats, self.out_channels, dim=1)
        noise = torch.randn_like(m)
        z = (m + noise * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask
425
+
426
+
427
class Generator(torch.nn.Module):
    """Upsampling decoder: transposed convolutions interleaved with residual blocks.

    Args:
        initial_channel: channel count of the input.
        resblock: ``"1"`` selects ``modules.ResBlock1``; anything else selects
            ``modules.ResBlock2``.
        resblock_kernel_sizes: one kernel size per residual block in a stage.
        resblock_dilation_sizes: one dilation spec per residual block in a stage.
        upsample_rates: stride of each transposed convolution.
        upsample_initial_channel: channel count after the first convolution;
            halved at every upsampling stage.
        upsample_kernel_sizes: kernel size of each transposed convolution.
        gin_channels: channel count of the conditioning input; 0 means no
            conditioning projection is created.
    """

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels=0,
    ):
        super(Generator, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        self.ups = nn.ModuleList()
        for i, (rate, kernel) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            upsample = ConvTranspose1d(
                upsample_initial_channel // (2**i),
                upsample_initial_channel // (2 ** (i + 1)),
                kernel,
                rate,
                padding=(kernel - rate) // 2,
            )
            self.ups.append(weight_norm(upsample))

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (i + 1))
            for kernel, dilation in zip(resblock_kernel_sizes, resblock_dilation_sizes):
                self.resblocks.append(resblock(ch, kernel, dilation))

        # ``ch`` still holds the channel count of the last upsampling stage.
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x, g=None):
        """Decode ``x`` (optionally conditioned on ``g``) to a tanh-bounded single-channel output."""
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for stage in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[stage](x)
            # Average the outputs of this stage's residual blocks.
            first_block = stage * self.num_kernels
            xs = None
            for offset in range(self.num_kernels):
                block_out = self.resblocks[first_block + offset](x)
                if xs is None:
                    xs = block_out
                else:
                    xs += block_out
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        return torch.tanh(x)

    def remove_weight_norm(self):
        """Strip weight normalisation from the upsampling layers and residual blocks."""
        print("Removing weight norm...")
        for upsample in self.ups:
            # Resolves to the module-level torch helper, not to this method.
            remove_weight_norm(upsample)
        for block in self.resblocks:
            block.remove_weight_norm()
502
+
503
+
504
class DiscriminatorP(torch.nn.Module):
    """Period discriminator: folds the signal by ``period`` and applies 2-D convolutions.

    Args:
        period: width the time axis is folded into.
        kernel_size: kernel height of every convolution in ``self.convs``.
        stride: stride along the folded time axis for the first four convolutions.
        use_spectral_norm: wrap layers in spectral norm; weight norm is used
            only when this is exactly ``False``.
    """

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        norm_f = weight_norm if use_spectral_norm is False else spectral_norm

        # (in_channels, out_channels) for the strided layers, in order.
        strided_channels = [(1, 32), (32, 128), (128, 512), (512, 1024)]
        layers = [
            norm_f(
                Conv2d(
                    c_in,
                    c_out,
                    (kernel_size, 1),
                    (stride, 1),
                    padding=(get_padding(kernel_size, 1), 0),
                )
            )
            for c_in, c_out in strided_channels
        ]
        # The last layer keeps the channel count and uses stride 1.
        layers.append(
            norm_f(
                Conv2d(
                    1024,
                    1024,
                    (kernel_size, 1),
                    1,
                    padding=(get_padding(kernel_size, 1), 0),
                )
            )
        )
        self.convs = nn.ModuleList(layers)
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return ``(flattened score, list of feature maps from every layer)``."""
        fmap = []

        # 1d to 2d
        b, c, t = x.shape
        remainder = t % self.period
        if remainder != 0:  # pad first
            n_pad = self.period - remainder
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
581
+
582
+
583
class DiscriminatorS(torch.nn.Module):
    """Discriminator built from a stack of 1-D (partly grouped) convolutions.

    Args:
        use_spectral_norm: wrap layers in spectral norm; weight norm is used
            only when this is exactly ``False``.
    """

    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        norm_f = weight_norm if use_spectral_norm is False else spectral_norm

        # (in_channels, out_channels, kernel, stride, groups, padding)
        layer_specs = [
            (1, 16, 15, 1, 1, 7),
            (16, 64, 41, 4, 4, 20),
            (64, 256, 41, 4, 16, 20),
            (256, 1024, 41, 4, 64, 20),
            (1024, 1024, 41, 4, 256, 20),
            (1024, 1024, 5, 1, 1, 2),
        ]
        self.convs = nn.ModuleList(
            [
                norm_f(Conv1d(c_in, c_out, kernel, stride, groups=groups, padding=pad))
                for c_in, c_out, kernel, stride, groups, pad in layer_specs
            ]
        )
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return ``(flattened score, list of feature maps from every layer)``."""
        fmap = []

        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
611
+
612
+
613
class MultiPeriodDiscriminator(torch.nn.Module):
    """Bundle of one ``DiscriminatorS`` and one ``DiscriminatorP`` per period.

    ``forward`` runs every sub-discriminator on both the real and the
    generated signal and returns the per-discriminator scores and feature
    maps as four parallel lists.
    """

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        periods = [2, 3, 5, 7, 11]

        members = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        for period in periods:
            members.append(
                DiscriminatorP(period, use_spectral_norm=use_spectral_norm)
            )
        self.discriminators = nn.ModuleList(members)

    def forward(self, y, y_hat):
        """Return ``(real_scores, generated_scores, real_fmaps, generated_fmaps)``."""
        real_scores, generated_scores = [], []
        real_fmaps, generated_fmaps = [], []

        for disc in self.discriminators:
            score_real, fmap_real = disc(y)
            score_gen, fmap_gen = disc(y_hat)
            real_scores.append(score_real)
            generated_scores.append(score_gen)
            real_fmaps.append(fmap_real)
            generated_fmaps.append(fmap_gen)

        return real_scores, generated_scores, real_fmaps, generated_fmaps
638
+
639
+
640
class ReferenceEncoder(nn.Module):
    """
    inputs --- [N, Ty/r, n_mels*r]  mels
    outputs --- [N, ref_enc_gru_size]

    A stack of stride-2 convolutions followed by a GRU whose final hidden
    state is linearly projected to ``gin_channels``.
    """

    def __init__(self, spec_channels, gin_channels=0):
        super().__init__()
        self.spec_channels = spec_channels

        ref_enc_filters = [32, 32, 64, 64, 128, 128]
        n_convs = len(ref_enc_filters)
        channel_chain = [1] + ref_enc_filters
        self.convs = nn.ModuleList(
            [
                weight_norm(
                    nn.Conv2d(
                        in_channels=c_in,
                        out_channels=c_out,
                        kernel_size=(3, 3),
                        stride=(2, 2),
                        padding=(1, 1),
                    )
                )
                for c_in, c_out in zip(channel_chain[:-1], channel_chain[1:])
            ]
        )

        # Size of the frequency axis once every convolution has been applied.
        reduced_freq = self.calculate_channels(spec_channels, 3, 2, 1, n_convs)
        self.gru = nn.GRU(
            input_size=ref_enc_filters[-1] * reduced_freq,
            hidden_size=256 // 2,
            batch_first=True,
        )
        self.proj = nn.Linear(128, gin_channels)

    def forward(self, inputs, mask=None):
        """Encode ``inputs`` into one vector per batch item; ``mask`` is unused."""
        batch = inputs.size(0)
        feats = inputs.view(batch, 1, -1, self.spec_channels)  # [N, 1, Ty, n_freqs]
        for conv in self.convs:
            feats = F.relu(conv(feats))  # [N, 128, Ty//2^K, n_mels//2^K]

        feats = feats.transpose(1, 2)  # [N, Ty//2^K, 128, n_mels//2^K]
        steps = feats.size(1)
        batch = feats.size(0)
        feats = feats.contiguous().view(batch, steps, -1)  # [N, Ty//2^K, 128*n_mels//2^K]

        self.gru.flatten_parameters()
        _, hidden = self.gru(feats)  # hidden --- [1, N, 128]

        return self.proj(hidden.squeeze(0))

    def calculate_channels(self, L, kernel_size, stride, pad, n_convs):
        """Return the length left after ``n_convs`` identical convolutions on ``L``."""
        for _ in range(n_convs):
            L = (L + 2 * pad - kernel_size) // stride + 1
        return L
697
+
698
+
699
class SynthesizerTrn(nn.Module):
    """
    Synthesizer for Training

    Owns the text encoder (``enc_p``), decoder (``dec``), posterior encoder
    (``enc_q``), flow, both duration predictors (``sdp`` / ``dp``) and the
    speaker-conditioning module (``emb_g`` or ``ref_enc``). The sub-modules
    can be traced to separate ONNX files with :meth:`export_onnx`.
    """

    def __init__(
        self,
        n_vocab,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        n_speakers=256,
        gin_channels=256,
        use_sdp=True,
        n_flow_layer=4,
        n_layers_trans_flow=4,
        flow_share_parameter=False,
        use_transformer_flow=True,
        **kwargs,
    ):
        """Store the hyper-parameters and build every sub-module.

        Optional ``kwargs`` read here (with their defaults):
        ``use_spk_conditioned_encoder`` (True), ``use_noise_scaled_mas``
        (False), ``mas_noise_scale_initial`` (0.01) and ``noise_scale_delta``
        (2e-6). Any other keyword is ignored.

        With ``n_speakers >= 1`` an ``nn.Embedding`` speaker table is created;
        otherwise a ``ReferenceEncoder`` is used for conditioning.
        """
        super().__init__()
        self.n_vocab = n_vocab
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.n_speakers = n_speakers
        self.gin_channels = gin_channels
        self.n_layers_trans_flow = n_layers_trans_flow
        self.use_spk_conditioned_encoder = kwargs.get(
            "use_spk_conditioned_encoder", True
        )
        self.use_sdp = use_sdp
        self.use_noise_scaled_mas = kwargs.get("use_noise_scaled_mas", False)
        self.mas_noise_scale_initial = kwargs.get("mas_noise_scale_initial", 0.01)
        self.noise_scale_delta = kwargs.get("noise_scale_delta", 2e-6)
        self.current_mas_noise_scale = self.mas_noise_scale_initial

        # Always define enc_gin_channels: it is read unconditionally below, so
        # leaving it unset raised AttributeError whenever speaker conditioning
        # of the text encoder was disabled or gin_channels was 0.
        # NOTE(review): 0 is assumed to mean "no conditioning" for TextEncoder
        # -- confirm against its definition.
        if self.use_spk_conditioned_encoder and gin_channels > 0:
            self.enc_gin_channels = gin_channels
        else:
            self.enc_gin_channels = 0

        self.enc_p = TextEncoder(
            n_vocab,
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
            gin_channels=self.enc_gin_channels,
        )
        self.dec = Generator(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        if use_transformer_flow:
            self.flow = TransformerCouplingBlock(
                inter_channels,
                hidden_channels,
                filter_channels,
                n_heads,
                n_layers_trans_flow,
                5,
                p_dropout,
                n_flow_layer,
                gin_channels=gin_channels,
                share_parameter=flow_share_parameter,
            )
        else:
            self.flow = ResidualCouplingBlock(
                inter_channels,
                hidden_channels,
                5,
                1,
                n_flow_layer,
                gin_channels=gin_channels,
            )
        self.sdp = StochasticDurationPredictor(
            hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels
        )
        self.dp = DurationPredictor(
            hidden_channels, 256, 3, 0.5, gin_channels=gin_channels
        )

        if n_speakers >= 1:
            self.emb_g = nn.Embedding(n_speakers, gin_channels)
        else:
            self.ref_enc = ReferenceEncoder(spec_channels, gin_channels)

    def export_onnx(
        self,
        path,
        max_len=None,
        sdp_ratio=0,
        y=None,
    ):
        """Trace the sub-modules with dummy inputs and write them as ONNX files.

        Files are written to ``onnx/{path}/{path}_<part>.onnx`` for the parts
        ``emb`` (only when ``n_speakers > 0``), ``enc_p``, ``sdp``, ``dp``,
        ``flow`` and ``dec``. The directory is not created here.

        Args:
            path: Used both as the sub-directory of ``onnx/`` and as the file
                name prefix.
            max_len: Upper bound applied to the last axis of the latent fed
                to the decoder; ``None`` keeps the full length.
            sdp_ratio: Weight of the stochastic duration predictor's output;
                the deterministic predictor is weighted ``1 - sdp_ratio``.
            y: Passed (transposed) to ``ref_enc`` when ``n_speakers`` is not
                positive; unused otherwise.

        Returns:
            None.
        """
        noise_scale = 0.667
        length_scale = 1
        noise_scale_w = 0.8
        out_prefix = f"onnx/{path}/{path}"

        # Fixed dummy token sequence used only to drive tracing.
        dummy_tokens = [
            0, 97, 0, 8, 0, 78, 0, 8, 0, 76, 0, 37, 0, 40, 0,
            97, 0, 8, 0, 23, 0, 8, 0, 74, 0, 26, 0, 104, 0,
        ]
        x = torch.LongTensor(dummy_tokens).unsqueeze(0).cpu()
        tone = torch.zeros_like(x).cpu()
        language = torch.zeros_like(x).cpu()
        x_lengths = torch.LongTensor([x.shape[1]]).cpu()
        sid = torch.LongTensor([0]).cpu()
        bert = torch.randn(size=(x.shape[1], 1024)).cpu()
        ja_bert = torch.randn(size=(x.shape[1], 1024)).cpu()
        en_bert = torch.randn(size=(x.shape[1], 1024)).cpu()

        if self.n_speakers > 0:
            g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
            torch.onnx.export(
                self.emb_g,
                (sid,),
                f"{out_prefix}_emb.onnx",
                input_names=["sid"],
                output_names=["g"],
                verbose=True,
            )
        else:
            g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1)

        torch.onnx.export(
            self.enc_p,
            (x, x_lengths, tone, language, bert, ja_bert, en_bert, g),
            f"{out_prefix}_enc_p.onnx",
            input_names=[
                "x",
                "x_lengths",
                "t",
                "language",
                "bert_0",
                "bert_1",
                "bert_2",
                "g",
            ],
            output_names=["xout", "m_p", "logs_p", "x_mask"],
            dynamic_axes={
                "x": [0, 1],
                "t": [0, 1],
                "language": [0, 1],
                "bert_0": [0],
                "bert_1": [0],
                "bert_2": [0],
                "xout": [0, 2],
                "m_p": [0, 2],
                "logs_p": [0, 2],
                "x_mask": [0, 2],
            },
            verbose=True,
            opset_version=16,
        )
        # Run the encoder for real so later stages get correctly shaped inputs.
        x, m_p, logs_p, x_mask = self.enc_p(
            x, x_lengths, tone, language, bert, ja_bert, en_bert, g=g
        )
        zinput = (
            torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype)
            * noise_scale_w
        )
        torch.onnx.export(
            self.sdp,
            (x, x_mask, zinput, g),
            f"{out_prefix}_sdp.onnx",
            input_names=["x", "x_mask", "zin", "g"],
            output_names=["logw"],
            dynamic_axes={"x": [0, 2], "x_mask": [0, 2], "zin": [0, 2], "logw": [0, 2]},
            verbose=True,
        )
        torch.onnx.export(
            self.dp,
            (x, x_mask, g),
            f"{out_prefix}_dp.onnx",
            input_names=["x", "x_mask", "g"],
            output_names=["logw"],
            dynamic_axes={"x": [0, 2], "x_mask": [0, 2], "logw": [0, 2]},
            verbose=True,
        )

        # Blend both duration predictors, then expand the prior statistics
        # along the resulting alignment path.
        logw = self.sdp(x, x_mask, zinput, g=g) * (sdp_ratio) + self.dp(
            x, x_mask, g=g
        ) * (1 - sdp_ratio)
        w = torch.exp(logw) * x_mask * length_scale
        w_ceil = torch.ceil(w)
        y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
        y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(
            x_mask.dtype
        )
        attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
        attn = commons.generate_path(w_ceil, attn_mask)

        m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(
            1, 2
        )  # [b, t', t], [b, t, d] -> [b, d, t']
        logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(
            1, 2
        )  # [b, t', t], [b, t, d] -> [b, d, t']

        z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
        torch.onnx.export(
            self.flow,
            (z_p, y_mask, g),
            f"{out_prefix}_flow.onnx",
            input_names=["z_p", "y_mask", "g"],
            output_names=["z"],
            dynamic_axes={"z_p": [0, 2], "y_mask": [0, 2], "z": [0, 2]},
            verbose=True,
        )

        z = self.flow(z_p, y_mask, g=g, reverse=True)
        z_in = (z * y_mask)[:, :, :max_len]

        torch.onnx.export(
            self.dec,
            (z_in, g),
            f"{out_prefix}_dec.onnx",
            input_names=["z_in", "g"],
            output_names=["o"],
            dynamic_axes={"z_in": [0, 2], "o": [0, 2]},
            verbose=True,
        )
        # The decoder is still run once on the same latent, as before; its
        # output is neither stored nor returned.
        self.dec((z * y_mask)[:, :, :max_len], g=g)
onnx_modules/V200/text/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .symbols import *