Mahiruoshi committed • Commit dc23363 • 1 parent: f6fda8c
Upload 112 files
- app.py +4 -6
- bert/bert-base-japanese-v3/README.md +1 -1
- bert/bert-base-japanese-v3/vocab.txt +1 -1
- bert/chinese-roberta-wwm-ext-large/.gitignore +1 -0
- bert/chinese-roberta-wwm-ext-large/README.md +5 -5
- bert/chinese-roberta-wwm-ext-large/added_tokens.json +1 -1
- bert/chinese-roberta-wwm-ext-large/special_tokens_map.json +1 -1
- bert/chinese-roberta-wwm-ext-large/tokenizer.json +0 -0
- bert/chinese-roberta-wwm-ext-large/tokenizer_config.json +1 -1
- bert_gen.py +6 -7
- configs/config.json +250 -27
- data_utils.py +1 -1
- filelists/esd.list +3 -0
- image/41JjBPWdHtL._SX342_SY445_.jpg +0 -0
- image/41JjBPWdHtL.jpg +0 -0
- logs/Bangdream/G_7000.pth +3 -0
- logs/Bangdream/config.json +154 -0
- models.py +1 -1
- monotonic_align/__pycache__/__init__.cpython-39.pyc +0 -0
- monotonic_align/__pycache__/core.cpython-39.pyc +0 -0
- preprocess_text.py +16 -1
- requirements.txt +0 -3
- text/__init__.py +0 -1
- text/__pycache__/__init__.cpython-39.pyc +0 -0
- text/__pycache__/chinese.cpython-39.pyc +0 -0
- text/__pycache__/chinese_bert.cpython-39.pyc +0 -0
- text/__pycache__/cleaner.cpython-39.pyc +0 -0
- text/__pycache__/english_bert_mock.cpython-39.pyc +0 -0
- text/__pycache__/japanese.cpython-39.pyc +0 -0
- text/__pycache__/japanese_bert.cpython-39.pyc +0 -0
- text/__pycache__/symbols.cpython-39.pyc +0 -0
- text/__pycache__/tone_sandhi.cpython-39.pyc +0 -0
- train_ms.py +2 -6
- utils.py +3 -4
- webui.py +224 -0
app.py
CHANGED
@@ -216,11 +216,9 @@ WrapStyle: 0
 PlayResX: 640
 PlayResY: 360
 ScaledBorderAndShadow: yes
-
 [V4+ Styles]
 Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
 Style: Default,Arial,20,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,1,1,2,10,10,10,1
-
 [Events]
 Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
 """
@@ -338,7 +336,7 @@ def audiobook(inputFile, groupsize, speaker, sdp_ratio, noise_scale, noise_scale
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "-m", "--model", default="./logs/BangDream/
+        "-m", "--model", default="./logs/BangDream/G_7000.pth", help="path of your model"
     )
     parser.add_argument(
         "-c",
@@ -387,7 +385,7 @@ if __name__ == "__main__":
     ]
     with gr.Blocks() as app:
         gr.Markdown(
-            f"
+            f"BanG Dream all-cast TTS. Please comply strictly with the law when using this model!\n When publishing derivative works, please credit the project author <a href='https://space.bilibili.com/19874615/'>Bilibili@Mahiroshi</a> and the project link.\nSee the usage guide at <a href='https://nijigaku.top/2023/10/03/BangDreamTTS/'>my blog</a></a>"
         )
         for band in BandList:
             with gr.TabItem(band):
@@ -444,9 +442,9 @@ if __name__ == "__main__":
                 with gr.Row():
                     with gr.Column():
                         gr.Markdown(
-                            f"See <a href='https://nijigaku.top/2023/10/03/BangDreamTTS/'>my blog</a>
+                            f"See the guide for making your own galgame at <a href='https://nijigaku.top/2023/10/03/BangDreamTTS/'>my blog</a>\n</a>"
                         )
-                        inputFile = gr.inputs.File(label="
+                        inputFile = gr.inputs.File(label="Upload a txt (a character-mapping table can be set), epub, or mobi file")
                         groupSize = gr.Slider(
                             minimum=10, maximum=1000, value=i[1], step=1, label="Maximum number of characters per audio file"
                         )
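The audiobook tab slices the uploaded book into chunks of at most groupSize characters and synthesizes one audio file per chunk. A minimal sketch of that kind of splitting; the helper name and the split-on-punctuation rule are illustrative, not app.py's actual implementation:

import re

def split_into_groups(text: str, group_size: int) -> list[str]:
    # Split after sentence-ending punctuation, then pack sentences greedily
    # so each chunk stays at or under group_size characters.
    sentences = re.split(r"(?<=[。！？!?.])", text)
    groups, current = [], ""
    for sentence in sentences:
        if current and len(current) + len(sentence) > group_size:
            groups.append(current)
            current = ""
        current += sentence
    if current:
        groups.append(current)
    return groups

print(split_into_groups("今日はいい天気。散歩に行こう!あとで雨が降るかも。", 12))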
bert/bert-base-japanese-v3/README.md
CHANGED
@@ -50,4 +50,4 @@ The pretrained models are distributed under the Apache License 2.0.
 
 ## Acknowledgments
 
-This model is trained with Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/) program.
+This model is trained with Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/) program.
bert/bert-base-japanese-v3/vocab.txt
CHANGED
@@ -13,7 +13,7 @@
 [unused7]
 [unused8]
 [unused9]
-
+
 !
 "
 #
bert/chinese-roberta-wwm-ext-large/.gitignore
ADDED
@@ -0,0 +1 @@
*.bin
bert/chinese-roberta-wwm-ext-large/README.md
CHANGED
@@ -1,5 +1,5 @@
 ---
-language:
+language:
 - zh
 tags:
 - bert
@@ -9,9 +9,9 @@ license: "apache-2.0"
 # Please use 'Bert' related functions to load this model!
 
 ## Chinese BERT with Whole Word Masking
-For further accelerating Chinese natural language processing, we provide **Chinese pre-trained BERT with Whole Word Masking**.
+For further accelerating Chinese natural language processing, we provide **Chinese pre-trained BERT with Whole Word Masking**.
 
-**[Pre-Training with Whole Word Masking for Chinese BERT](https://arxiv.org/abs/1906.08101)**
+**[Pre-Training with Whole Word Masking for Chinese BERT](https://arxiv.org/abs/1906.08101)**
 Yiming Cui, Wanxiang Che, Ting Liu, Bing Qin, Ziqing Yang, Shijin Wang, Guoping Hu
 
 This repository is developed based on: https://github.com/google-research/bert
@@ -46,7 +46,7 @@ If you find the technical report or resource is useful, please cite the following
 pages = "657--668",
 }
 ```
-- Secondary: https://arxiv.org/abs/1906.08101
+- Secondary: https://arxiv.org/abs/1906.08101
 ```
 @article{chinese-bert-wwm,
 title={Pre-Training with Whole Word Masking for Chinese BERT},
@@ -54,4 +54,4 @@ If you find the technical report or resource is useful, please cite the following
 journal={arXiv preprint arXiv:1906.08101},
 year={2019}
 }
-```
+```
bert/chinese-roberta-wwm-ext-large/added_tokens.json
CHANGED
@@ -1 +1 @@
-{}
+{}
bert/chinese-roberta-wwm-ext-large/special_tokens_map.json
CHANGED
@@ -1 +1 @@
-{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
+{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
bert/chinese-roberta-wwm-ext-large/tokenizer.json
CHANGED
The diff for this file is too large to render.
See raw diff
bert/chinese-roberta-wwm-ext-large/tokenizer_config.json
CHANGED
@@ -1 +1 @@
-{"init_inputs": []}
+{"init_inputs": []}
bert_gen.py
CHANGED
@@ -21,13 +21,12 @@ def process_line(line):
     word2ph = [i for i in word2ph]
     phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
 
-    [six removed lines; their content is truncated in the source rendering]
-    word2ph[0] += 1
+    phone = commons.intersperse(phone, 0)
+    tone = commons.intersperse(tone, 0)
+    language = commons.intersperse(language, 0)
+    for i in range(len(word2ph)):
+        word2ph[i] = word2ph[i] * 2
+    word2ph[0] += 1
 
     bert_path = wav_path.replace(".wav", ".bert.pt")
 
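The new lines interleave a blank id (0) between every phone, tone, and language id, then double each word2ph count (plus one on the first entry) so the word-to-phoneme alignment still matches the lengthened sequence. A minimal sketch of an intersperse helper with that behaviour; the list-based signature is assumed from the calls above, while the project's own version lives in commons.py:

def intersperse(lst, item):
    # [x0, x1, x2] -> [item, x0, item, x1, item, x2, item]
    result = [item] * (len(lst) * 2 + 1)
    result[1::2] = lst
    return result

phone = [5, 7, 9]
assert intersperse(phone, 0) == [0, 5, 0, 7, 0, 9, 0]
# The result is 2*len(lst)+1 long, which is why every word2ph entry is
# doubled and the first one gets the extra +1.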
configs/config.json
CHANGED
@@ -10,7 +10,7 @@
     0.99
   ],
   "eps": 1e-09,
-  "batch_size":
+  "batch_size": 8,
   "fp16_run": false,
   "lr_decay": 0.999875,
   "segment_size": 16384,
@@ -35,31 +35,254 @@
   "n_speakers": 256,
   "cleaned_text": true,
   "spk2id": {
-    [25 removed "speaker name": id entries; their contents are truncated in the source rendering]
+    [248 new "speaker name": id entries, ids 0-247; the names are garbled in the source rendering and are not reproduced here]
   }
   },
   "model": {
@@ -116,4 +339,4 @@
   "use_spectral_norm": false,
   "gin_channels": 256
   }
-}
+}
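spk2id maps each speaker name to the integer id the model was trained with; at inference time the name picked in the UI is turned into a speaker-id tensor. A minimal sketch of that lookup with a made-up two-entry map (the real map is the 248-entry one above):

import torch

spk2id = {"派蒙": 0, "空": 1}  # illustrative entries only

def speaker_tensor(name: str) -> torch.Tensor:
    # Same pattern webui.py uses before calling net_g.infer().
    return torch.LongTensor([spk2id[name]])

print(speaker_tensor("派蒙"))  # tensor([0])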
data_utils.py
CHANGED
@@ -155,7 +155,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         if language_str == "ZH":
             bert = bert
             ja_bert = torch.zeros(768, len(phone))
-        elif language_str == "
+        elif language_str == "JP":
             ja_bert = bert
             bert = torch.zeros(1024, len(phone))
         else:
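The fix restores the Japanese branch: Chinese text keeps its 1024-dim RoBERTa-large features in bert, Japanese text gets 768-dim BERT-base features in ja_bert, and whichever slot is not used is zero-filled so both tensors always have fixed feature sizes. A standalone sketch of the same branching, assuming those hidden sizes:

import torch

def split_bert_features(bert, language_str, phone_len):
    # Returns (bert, ja_bert); the language not being synthesised is all zeros.
    if language_str == "ZH":
        return bert, torch.zeros(768, phone_len)
    if language_str == "JP":
        return torch.zeros(1024, phone_len), bert
    return torch.zeros(1024, phone_len), torch.zeros(768, phone_len)

zh_bert, ja_bert = split_bert_features(torch.randn(1024, 12), "ZH", 12)
print(zh_bert.shape, ja_bert.shape)  # torch.Size([1024, 12]) torch.Size([768, 12])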
filelists/esd.list
ADDED
@@ -0,0 +1,3 @@
Example:
{wav_path}|{speaker_name}|{language}|{text}
派蒙_1.wav|派蒙|ZH|前面的区域，以后再来探索吧！
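Each filelist line follows the {wav_path}|{speaker_name}|{language}|{text} pattern shown above. A tiny, hypothetical parser for one raw line (before cleaning appends the phones/tones/word2ph columns):

def parse_filelist_line(line: str) -> dict:
    wav_path, speaker, language, text = line.strip().split("|")
    return {"wav_path": wav_path, "speaker": speaker, "language": language, "text": text}

print(parse_filelist_line("派蒙_1.wav|派蒙|ZH|前面的区域，以后再来探索吧！"))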
image/41JjBPWdHtL._SX342_SY445_.jpg
ADDED
image/41JjBPWdHtL.jpg
ADDED
logs/Bangdream/G_7000.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:92e3ea6239c8f2b16efff571ba07232dd5de71067d2fc87e3f2e0ef490e2d7eb
size 857912686
logs/Bangdream/config.json
ADDED
@@ -0,0 +1,154 @@
{
  "train": {
    "log_interval": 200,
    "eval_interval": 1000,
    "seed": 52,
    "epochs": 10000,
    "learning_rate": 0.0003,
    "betas": [0.8, 0.99],
    "eps": 1e-09,
    "batch_size": 16,
    "fp16_run": false,
    "lr_decay": 0.999875,
    "segment_size": 16384,
    "init_lr_ratio": 1,
    "warmup_epochs": 0,
    "c_mel": 45,
    "c_kl": 1.0,
    "skip_optimizer": true
  },
  "data": {
    "training_files": "filelists/train.list",
    "validation_files": "filelists/val.list",
    "max_wav_value": 32768.0,
    "sampling_rate": 44100,
    "filter_length": 2048,
    "hop_length": 512,
    "win_length": 2048,
    "n_mel_channels": 128,
    "mel_fmin": 0.0,
    "mel_fmax": null,
    "add_blank": true,
    "n_speakers": 256,
    "cleaned_text": true,
    "spk2id": {
      [60 "speaker name": id entries, ids 0-59 (e.g. "CHIYU": 50); most names are garbled in the source rendering and are not reproduced here]
    }
  },
  "model": {
    "use_spk_conditioned_encoder": true,
    "use_noise_scaled_mas": true,
    "use_mel_posterior_encoder": false,
    "use_duration_discriminator": true,
    "inter_channels": 192,
    "hidden_channels": 192,
    "filter_channels": 768,
    "n_heads": 2,
    "n_layers": 6,
    "kernel_size": 3,
    "p_dropout": 0.1,
    "resblock": "1",
    "resblock_kernel_sizes": [3, 7, 11],
    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    "upsample_rates": [8, 8, 2, 2, 2],
    "upsample_initial_channel": 512,
    "upsample_kernel_sizes": [16, 16, 8, 2, 2],
    "n_layers_q": 3,
    "use_spectral_norm": false,
    "gin_channels": 256
  }
}
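The data block pins down the frame geometry: at 44.1 kHz with hop_length 512, one mel frame covers about 11.6 ms, and a 16384-sample training segment is roughly 0.37 s of audio. A quick check of those numbers, taken straight from the config above:

sampling_rate = 44100
hop_length = 512
filter_length = 2048
segment_size = 16384

print(hop_length / sampling_rate * 1000)  # ~11.6 ms of audio per mel frame
print(sampling_rate / hop_length)         # ~86.1 frames per second
print(segment_size / sampling_rate)       # ~0.37 s per training segment
print(filter_length // 2 + 1)             # 1025 spectrogram bins (the value passed to SynthesizerTrn)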
models.py
CHANGED
@@ -763,7 +763,7 @@ class SynthesizerTrn(nn.Module):
         gin_channels=256,
         use_sdp=True,
         n_flow_layer=4,
-        n_layers_trans_flow=
+        n_layers_trans_flow=6,
         flow_share_parameter=False,
         use_transformer_flow=True,
         **kwargs
monotonic_align/__pycache__/__init__.cpython-39.pyc
CHANGED
Binary files a/monotonic_align/__pycache__/__init__.cpython-39.pyc and b/monotonic_align/__pycache__/__init__.cpython-39.pyc differ
monotonic_align/__pycache__/core.cpython-39.pyc
CHANGED
Binary files a/monotonic_align/__pycache__/core.cpython-39.pyc and b/monotonic_align/__pycache__/core.cpython-39.pyc differ
preprocess_text.py
CHANGED
@@ -1,4 +1,5 @@
 import json
+import os.path
 from collections import defaultdict
 from random import shuffle
 from typing import Optional
@@ -11,7 +12,7 @@ from text.cleaner import clean_text
 @click.command()
 @click.option(
     "--transcription-path",
-    default="filelists/
+    default="filelists/genshin.list",
     type=click.Path(exists=True, file_okay=True, dir_okay=False),
 )
 @click.option("--cleaned-path", default=None)
@@ -67,13 +68,27 @@ def main(
     current_sid = 0
 
     with open(transcription_path, encoding="utf-8") as f:
+        audioPaths = set()
+        countSame = 0
+        countNotFound = 0
         for line in f.readlines():
             utt, spk, language, text, phones, tones, word2ph = line.strip().split("|")
+            if utt in audioPaths:
+                # Filter dataset errors: the same audio matched to several texts breaks the later bert step
+                print(f"Duplicate audio text: {line}")
+                countSame += 1
+                continue
+            if not os.path.isfile(utt):
+                print(f"No audio found for: {utt}")
+                countNotFound += 1
+                continue
+            audioPaths.add(utt)
             spk_utt_map[spk].append(line)
 
             if spk not in spk_id_map.keys():
                 spk_id_map[spk] = current_sid
                 current_sid += 1
+        print(f"Total duplicate audio: {countSame}, total missing audio: {countNotFound}")
 
     train_list = []
     val_list = []
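The added filter drops lines whose wav path has already been seen (one audio file matched to several transcripts breaks the later bert_gen step) and lines whose wav file is missing on disk, then reports both counts. A standalone sketch of the same pass over raw filelist lines, assuming the path is the first |-separated field:

import os

def filter_filelist(lines):
    seen, kept, duplicates, missing = set(), [], 0, 0
    for line in lines:
        utt = line.split("|", 1)[0]
        if utt in seen:
            duplicates += 1      # same wav already matched to another transcript
            continue
        if not os.path.isfile(utt):
            missing += 1         # transcript refers to a wav that does not exist
            continue
        seen.add(utt)
        kept.append(line)
    print(f"duplicates: {duplicates}, missing audio: {missing}")
    return kept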
requirements.txt
CHANGED
@@ -21,6 +21,3 @@ unidic-lite
 cmudict
 fugashi
 num2words
-PyPDF2
-ebooklib
-beautifulsoup4
text/__init__.py
CHANGED
@@ -1,6 +1,5 @@
 from text.symbols import *
 
-
 _symbol_to_id = {s: i for i, s in enumerate(symbols)}
 
 
text/__pycache__/__init__.cpython-39.pyc
CHANGED
Binary files a/text/__pycache__/__init__.cpython-39.pyc and b/text/__pycache__/__init__.cpython-39.pyc differ
text/__pycache__/chinese.cpython-39.pyc
CHANGED
Binary files a/text/__pycache__/chinese.cpython-39.pyc and b/text/__pycache__/chinese.cpython-39.pyc differ
text/__pycache__/chinese_bert.cpython-39.pyc
CHANGED
Binary files a/text/__pycache__/chinese_bert.cpython-39.pyc and b/text/__pycache__/chinese_bert.cpython-39.pyc differ
text/__pycache__/cleaner.cpython-39.pyc
CHANGED
Binary files a/text/__pycache__/cleaner.cpython-39.pyc and b/text/__pycache__/cleaner.cpython-39.pyc differ
text/__pycache__/english_bert_mock.cpython-39.pyc
CHANGED
Binary files a/text/__pycache__/english_bert_mock.cpython-39.pyc and b/text/__pycache__/english_bert_mock.cpython-39.pyc differ
text/__pycache__/japanese.cpython-39.pyc
CHANGED
Binary files a/text/__pycache__/japanese.cpython-39.pyc and b/text/__pycache__/japanese.cpython-39.pyc differ
text/__pycache__/japanese_bert.cpython-39.pyc
CHANGED
Binary files a/text/__pycache__/japanese_bert.cpython-39.pyc and b/text/__pycache__/japanese_bert.cpython-39.pyc differ
text/__pycache__/symbols.cpython-39.pyc
CHANGED
Binary files a/text/__pycache__/symbols.cpython-39.pyc and b/text/__pycache__/symbols.cpython-39.pyc differ
text/__pycache__/tone_sandhi.cpython-39.pyc
CHANGED
Binary files a/text/__pycache__/tone_sandhi.cpython-39.pyc and b/text/__pycache__/tone_sandhi.cpython-39.pyc differ
train_ms.py
CHANGED
@@ -42,12 +42,6 @@ torch.backends.cuda.enable_mem_efficient_sdp(
 torch.backends.cuda.enable_math_sdp(True)
 global_step = 0
 
-import os
-
-os.environ['MASTER_ADDR'] = '127.0.0.1'
-os.environ['MASTER_PORT'] = '8880'
-os.environ['WORLD_SIZE'] = '1'
-os.environ['RANK'] = '0'
 
 def run():
     dist.init_process_group(
@@ -197,6 +191,8 @@ def run():
             optim_g.param_groups[0]["initial_lr"] = g_resume_lr
         if not optim_d.param_groups[0].get("initial_lr"):
             optim_d.param_groups[0]["initial_lr"] = d_resume_lr
+        if not optim_dur_disc.param_groups[0].get("initial_lr"):
+            optim_dur_disc.param_groups[0]["initial_lr"] = dur_resume_lr
 
         epoch_str = max(epoch_str, 1)
         global_step = (epoch_str - 1) * len(train_loader)
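The two added lines mirror what is already done for the generator and discriminator optimizers: when training resumes from a checkpoint, PyTorch's ExponentialLR requires an initial_lr key in every param group before it can be built with last_epoch set, so the duration discriminator's optimizer gets the same backfill. A minimal reproduction of the pattern with an illustrative stand-in module:

import torch

model = torch.nn.Linear(4, 4)                    # stand-in for the duration discriminator
optim = torch.optim.AdamW(model.parameters(), lr=3e-4)
resume_lr = 3e-4                                 # learning rate recovered from the checkpoint

# Without this key, ExponentialLR(..., last_epoch=9) raises a KeyError on resume.
if not optim.param_groups[0].get("initial_lr"):
    optim.param_groups[0]["initial_lr"] = resume_lr

scheduler = torch.optim.lr_scheduler.ExponentialLR(optim, gamma=0.999875, last_epoch=9)
print(scheduler.get_last_lr())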
utils.py
CHANGED
@@ -206,15 +206,14 @@ def get_hparams(init=True):
     config_path = args.config
     config_save_path = os.path.join(model_dir, "config.json")
     if init:
-        with open(config_path, "r") as f:
+        with open(config_path, "r", encoding="utf-8") as f:
             data = f.read()
-        with open(config_save_path, "w") as f:
+        with open(config_save_path, "w", encoding="utf-8") as f:
             f.write(data)
     else:
-        with open(config_save_path, "r") as f:
+        with open(config_save_path, "r", encoding="utf-8") as f:
             data = f.read()
     config = json.loads(data)
-
     hparams = HParams(**config)
     hparams.model_dir = model_dir
     return hparams
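The explicit encoding matters because config.json now holds non-ASCII speaker names: open() without an encoding falls back to the platform locale (for example cp936/GBK on Chinese Windows), which can garble or reject UTF-8 JSON. A small round-trip demonstrating the point, using an illustrative one-entry config:

import json, os, tempfile

cfg = {"data": {"spk2id": {"派蒙": 0}}}
path = os.path.join(tempfile.mkdtemp(), "config.json")
with open(path, "w", encoding="utf-8") as f:
    json.dump(cfg, f, ensure_ascii=False)

# Explicit utf-8 round-trips the speaker names on any OS; relying on the
# locale default can raise UnicodeDecodeError or corrupt the names on Windows.
with open(path, "r", encoding="utf-8") as f:
    print(json.loads(f.read())["data"]["spk2id"])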
webui.py
ADDED
@@ -0,0 +1,224 @@
# flake8: noqa: E402

import sys, os
import logging

logging.getLogger("numba").setLevel(logging.WARNING)
logging.getLogger("markdown_it").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("matplotlib").setLevel(logging.WARNING)

logging.basicConfig(
    level=logging.INFO, format="| %(name)s | %(levelname)s | %(message)s"
)

logger = logging.getLogger(__name__)

import torch
import argparse
import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import cleaned_text_to_sequence, get_bert
from text.cleaner import clean_text
import gradio as gr
import webbrowser
import numpy as np

net_g = None

if sys.platform == "darwin" and torch.backends.mps.is_available():
    device = "mps"
    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
else:
    device = "cuda"


def get_text(text, language_str, hps):
    norm_text, phone, tone, word2ph = clean_text(text, language_str)
    phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)

    if hps.data.add_blank:
        phone = commons.intersperse(phone, 0)
        tone = commons.intersperse(tone, 0)
        language = commons.intersperse(language, 0)
        for i in range(len(word2ph)):
            word2ph[i] = word2ph[i] * 2
        word2ph[0] += 1
    bert = get_bert(norm_text, word2ph, language_str, device)
    del word2ph
    assert bert.shape[-1] == len(phone), phone

    if language_str == "ZH":
        bert = bert
        ja_bert = torch.zeros(768, len(phone))
    elif language_str == "JP":
        ja_bert = bert
        bert = torch.zeros(1024, len(phone))
    else:
        bert = torch.zeros(1024, len(phone))
        ja_bert = torch.zeros(768, len(phone))

    assert bert.shape[-1] == len(
        phone
    ), f"Bert seq len {bert.shape[-1]} != {len(phone)}"

    phone = torch.LongTensor(phone)
    tone = torch.LongTensor(tone)
    language = torch.LongTensor(language)
    return bert, ja_bert, phone, tone, language


def infer(text, sdp_ratio, noise_scale, noise_scale_w, length_scale, sid, language):
    global net_g
    bert, ja_bert, phones, tones, lang_ids = get_text(text, language, hps)
    with torch.no_grad():
        x_tst = phones.to(device).unsqueeze(0)
        tones = tones.to(device).unsqueeze(0)
        lang_ids = lang_ids.to(device).unsqueeze(0)
        bert = bert.to(device).unsqueeze(0)
        ja_bert = ja_bert.to(device).unsqueeze(0)
        x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
        del phones
        speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
        audio = (
            net_g.infer(
                x_tst,
                x_tst_lengths,
                speakers,
                tones,
                lang_ids,
                bert,
                ja_bert,
                sdp_ratio=sdp_ratio,
                noise_scale=noise_scale,
                noise_scale_w=noise_scale_w,
                length_scale=length_scale,
            )[0][0, 0]
            .data.cpu()
            .float()
            .numpy()
        )
        del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers
        torch.cuda.empty_cache()
        return audio


def tts_fn(
    text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale, language
):
    slices = text.split("|")
    audio_list = []
    with torch.no_grad():
        for slice in slices:
            audio = infer(
                slice,
                sdp_ratio=sdp_ratio,
                noise_scale=noise_scale,
                noise_scale_w=noise_scale_w,
                length_scale=length_scale,
                sid=speaker,
                language=language,
            )
            audio_list.append(audio)
            silence = np.zeros(hps.data.sampling_rate)  # generate 1 second of silence
            audio_list.append(silence)  # append the silence to the list
    audio_concat = np.concatenate(audio_list)
    return "Success", (hps.data.sampling_rate, audio_concat)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m", "--model", default="./logs/as/G_8000.pth", help="path of your model"
    )
    parser.add_argument(
        "-c",
        "--config",
        default="./configs/config.json",
        help="path of your config file",
    )
    parser.add_argument(
        "--share", default=False, help="make link public", action="store_true"
    )
    parser.add_argument(
        "-d", "--debug", action="store_true", help="enable DEBUG-LEVEL log"
    )

    args = parser.parse_args()
    if args.debug:
        logger.info("Enable DEBUG-LEVEL log")
        logging.basicConfig(level=logging.DEBUG)
    hps = utils.get_hparams_from_file(args.config)

    device = (
        "cuda:0"
        if torch.cuda.is_available()
        else (
            "mps"
            if sys.platform == "darwin" and torch.backends.mps.is_available()
            else "cpu"
        )
    )
    net_g = SynthesizerTrn(
        len(symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model,
    ).to(device)
    _ = net_g.eval()

    _ = utils.load_checkpoint(args.model, net_g, None, skip_optimizer=True)

    speaker_ids = hps.data.spk2id
    speakers = list(speaker_ids.keys())
    languages = ["ZH", "JP"]
    with gr.Blocks() as app:
        with gr.Row():
            with gr.Column():
                text = gr.TextArea(
                    label="Text",
                    placeholder="Input Text Here",
                    value="吃葡萄不吐葡萄皮，不吃葡萄倒吐葡萄皮。",
                )
                speaker = gr.Dropdown(
                    choices=speakers, value=speakers[0], label="Speaker"
                )
                sdp_ratio = gr.Slider(
                    minimum=0, maximum=1, value=0.2, step=0.1, label="SDP Ratio"
                )
                noise_scale = gr.Slider(
                    minimum=0.1, maximum=2, value=0.6, step=0.1, label="Noise Scale"
                )
                noise_scale_w = gr.Slider(
                    minimum=0.1, maximum=2, value=0.8, step=0.1, label="Noise Scale W"
                )
                length_scale = gr.Slider(
                    minimum=0.1, maximum=2, value=1, step=0.1, label="Length Scale"
                )
                language = gr.Dropdown(
                    choices=languages, value=languages[0], label="Language"
                )
                btn = gr.Button("Generate!", variant="primary")
            with gr.Column():
                text_output = gr.Textbox(label="Message")
                audio_output = gr.Audio(label="Output Audio")

        btn.click(
            tts_fn,
            inputs=[
                text,
                speaker,
                sdp_ratio,
                noise_scale,
                noise_scale_w,
                length_scale,
                language,
            ],
            outputs=[text_output, audio_output],
        )

    webbrowser.open("http://127.0.0.1:7860")
    app.launch(share=args.share)
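tts_fn returns a status string plus the (sampling_rate, waveform) tuple that Gradio plays; outside the UI the same tuple can be written straight to disk. A minimal sketch, assuming the model and hps were set up as in the __main__ block above, that scipy is installed, and that the chosen speaker name exists in hps.data.spk2id:

import numpy as np
from scipy.io import wavfile

status, (sr, audio) = tts_fn(
    "你好|今天天气不错",   # "|" splits the text into separately synthesised slices
    speaker="派蒙",        # illustrative name; must be a key of hps.data.spk2id
    sdp_ratio=0.2,
    noise_scale=0.6,
    noise_scale_w=0.8,
    length_scale=1.0,
    language="ZH",
)
wavfile.write("output.wav", sr, (audio * 32767).astype(np.int16))
print(status)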