ChatTTS-Forge

Running

App Files Files Community

zhzluke96 committed on Jun 5

Commit

84cfd61

•

1 Parent(s): 22884c9

update

Browse files

Files changed (5) hide show

modules/devices.py +8 -0
modules/generate_audio.py +4 -0
modules/normalization.py +38 -10
modules/utils/audio.py +10 -0
webui.py +27 -13

modules/devices.py ADDED Viewed

	@@ -0,0 +1,8 @@

+import torch
+def torch_gc():
+    """Free PyTorch's cached CUDA memory; do nothing when CUDA is unavailable."""
+    if not torch.cuda.is_available():
+        return
+    with torch.cuda.device("cuda"):
+        # Drop unused cached allocations, then collect released CUDA IPC memory.
+        torch.cuda.empty_cache()
+        torch.cuda.ipc_collect()

modules/generate_audio.py CHANGED Viewed

@@ -8,6 +8,8 @@ from modules import models, config
 import logging
 logger = logging.getLogger(__name__)
@@ -96,6 +98,8 @@ def generate_audio_batch(
     sample_rate = 24000
     return [(sample_rate, np.array(wav).flatten().astype(np.float32)) for wav in wavs]

 import logging
+from modules import devices
 logger = logging.getLogger(__name__)
     sample_rate = 24000
+    devices.torch_gc()
     return [(sample_rate, np.array(wav).flatten().astype(np.float32)) for wav in wavs]

modules/normalization.py CHANGED Viewed

@@ -75,13 +75,15 @@ character_map = {
     "“": " ",
     "’": " ",
     "”": " ",
     ":": ",",
     ";": ",",
     "!": ".",
     "(": ",",
     ")": ",",
-    # '[': ',',
-    # ']': ',',
     ">": ",",
     "<": ",",
     "-": ",",
@@ -110,13 +112,6 @@ def apply_emoji_map(text):
     return emojiswitch.demojize(text, delimiters=("", ""), lang="zh")
-@pre_normalize()
-def apply_markdown_to_text(text):
-    if is_markdown(text):
-        text = markdown_to_text(text)
-    return text
 @post_normalize()
 def insert_spaces_between_uppercase(s):
     # 使用正则表达式在每个相邻的大写字母之间插入空格
@@ -127,6 +122,29 @@ def insert_spaces_between_uppercase(s):
     )
 def ensure_suffix(a: str, b: str, c: str):
     a = a.strip()
     if not a.endswith(b):
@@ -171,6 +189,7 @@ def sentence_normalize(sentence_text: str):
         sentences = tx.normalize(part)
         dest_text = ""
         for sentence in sentences:
             dest_text += sentence
         return dest_text
@@ -197,7 +216,6 @@ def text_normalize(text, is_end=False):
     lines = [line for line in lines if line]
     lines = [sentence_normalize(line) for line in lines]
     content = "\n".join(lines)
-    content = apply_post_normalize(content)
     return content
@@ -216,6 +234,16 @@ console.log('1')
 *一条文本*
         """,
     ]
     for i, test_case in enumerate(test_cases):

     "“": " ",
     "’": " ",
     "”": " ",
+    '"': " ",
+    "'": " ",
     ":": ",",
     ";": ",",
     "!": ".",
     "(": ",",
     ")": ",",
+    "[": ",",
+    "]": ",",
     ">": ",",
     "<": ",",
     "-": ",",
     return emojiswitch.demojize(text, delimiters=("", ""), lang="zh")
 @post_normalize()
 def insert_spaces_between_uppercase(s):
     # 使用正则表达式在每个相邻的大写字母之间插入空格
     )
+@pre_normalize()
+def apply_markdown_to_text(text):
+    """Run ``markdown_to_text`` on input that ``is_markdown`` accepts.
+
+    Any other text is returned unchanged.
+    """
+    return markdown_to_text(text) if is_markdown(text) else text
+# "xxx" => \n"xxx"\n
+# 'xxx' => \n'xxx'\n
+@pre_normalize()
+def replace_quotes(text):
+    """Surround every quoted span (quote marks included) with newlines.
+
+    Handles straight double quotes, straight single quotes, and the curly
+    double and single quote pairs, in that order. A span must contain at
+    least one character between its opening and closing mark.
+    """
+    quote_pairs = (
+        ('"', '"'),
+        ("'", "'"),
+        ("“", "”"),
+        ("‘", "’"),
+    )
+    for opener, closer in quote_pairs:
+        # Opening mark, one or more non-quote characters, closing mark.
+        quoted_span = rf"({opener}[^{opener}{closer}]+?{closer})"
+        text = re.sub(quoted_span, r"\n\1\n", text)
+    return text
 def ensure_suffix(a: str, b: str, c: str):
     a = a.strip()
     if not a.endswith(b):
         sentences = tx.normalize(part)
         dest_text = ""
         for sentence in sentences:
+            sentence = apply_post_normalize(sentence)
             dest_text += sentence
         return dest_text
     lines = [line for line in lines if line]
     lines = [sentence_normalize(line) for line in lines]
     content = "\n".join(lines)
     return content
 *一条文本*
         """,
+        """
+在沙漠、岩石、雪地上行走了很长的时间以后，小王子终于发现了一条大路。所有的大路都是通往人住的地方的。
+“你们好。”小王子说。
+这是一个玫瑰盛开的花园。
+“你好。”玫瑰花说道。
+小王子瞅着这些花，它们全都和他的那朵花一样。
+“你们是什么花？”小王子惊奇地问。
+“我们是玫瑰花。”花儿们说道。
+“啊！”小王子说……。
+        """,
     ]
     for i, test_case in enumerate(test_cases):

modules/utils/audio.py CHANGED Viewed

@@ -5,6 +5,16 @@ import pyrubberband as pyrb
 import numpy as np
 from io import BytesIO
 def audiosegment_to_librosawav(audiosegment):
     channel_sounds = audiosegment.split_to_mono()

 import numpy as np
 from io import BytesIO
+INT16_MAX = np.iinfo(np.int16).max
+def audio_to_int16(audio_data):
+    """Convert floating-point audio samples to 16-bit integer PCM.
+
+    Args:
+        audio_data: NumPy array (or NumPy scalar) of samples. Floating-point
+            input is treated as normalised to [-1.0, 1.0]. Anything else
+            (integer arrays, raw bytes, other objects) is returned unchanged.
+
+    Returns:
+        An ``np.int16`` array for floating-point input, otherwise the input
+        object itself.
+    """
+    is_numpy = isinstance(audio_data, (np.ndarray, np.generic))
+    if not is_numpy or not np.issubdtype(audio_data.dtype, np.floating):
+        return audio_data
+    # Scale in float32: in float16 the factor 32767 rounds up to 32768, which
+    # overflows int16 for a full-scale sample.
+    samples = audio_data.astype(np.float32)
+    # Saturate out-of-range samples instead of letting the int16 cast wrap.
+    samples = np.clip(samples, -1.0, 1.0)
+    return (samples * INT16_MAX).astype(np.int16)
 def audiosegment_to_librosawav(audiosegment):
     channel_sounds = audiosegment.split_to_mono()

webui.py CHANGED Viewed

@@ -1,4 +1,16 @@
-import spaces
 import os
 import logging
@@ -29,7 +41,7 @@ from modules.api.utils import calc_spk_style
 from modules.normalization import text_normalize
 from modules import refiner, config
-from modules.utils import env
 from modules.SentenceSplitter import SentenceSplitter
 torch._dynamo.config.cache_size_limit = 64
@@ -40,7 +52,7 @@ webui_config = {
     "tts_max": 1000,
     "ssml_max": 5000,
     "spliter_threshold": 100,
-    "max_batch_size": 12,
 }
@@ -65,7 +77,7 @@ def segments_length_limit(segments, total_max: int):
 @torch.inference_mode()
 @spaces.GPU
-def synthesize_ssml(ssml: str, batch_size=8):
     try:
         batch_size = int(batch_size)
     except Exception:
@@ -92,7 +104,10 @@ def synthesize_ssml(ssml: str, batch_size=8):
     buffer.seek(0)
-    return buffer.read()
 @torch.inference_mode()
@@ -110,12 +125,12 @@ def tts_generate(
     prefix,
     style,
     disable_normalize=False,
-    batch_size=8,
 ):
     try:
         batch_size = int(batch_size)
     except Exception:
-        batch_size = 8
     max_len = webui_config["tts_max"]
     text = text.strip()[0:max_len]
@@ -157,8 +172,6 @@ def tts_generate(
             prompt2=prompt2,
             prefix=prefix,
         )
-        return sample_rate, audio_data
     else:
         spliter = SentenceSplitter(webui_config["spliter_threshold"])
         sentences = spliter.parse(text)
@@ -178,7 +191,8 @@ def tts_generate(
         sample_rate = audio_data_batch[0][0]
         audio_data = np.concatenate([data for _, data in audio_data_batch])
-        return sample_rate, audio_data
 @torch.inference_mode()
@@ -366,7 +380,7 @@ def create_tts_interface():
                 batch_size_input = gr.Slider(
                     1,
                     webui_config["max_batch_size"],
-                    value=8,
                     step=1,
                     label="Batch Size",
                 )
@@ -593,7 +607,7 @@ def create_ssml_interface():
                 # batch size
                 batch_size_input = gr.Slider(
                     label="Batch Size",
-                    value=8,
                     minimum=1,
                     maximum=webui_config["max_batch_size"],
                     step=1,
@@ -892,7 +906,7 @@ if __name__ == "__main__":
     webui_config["tts_max"] = env.get_env_or_arg(args, "tts_max_len", 1000, int)
     webui_config["ssml_max"] = env.get_env_or_arg(args, "ssml_max_len", 5000, int)
-    webui_config["max_batch_size"] = env.get_env_or_arg(args, "max_batch_size", 12, int)
     demo = create_interface()

+try:
+    import spaces
+except ImportError:
+    # The `spaces` package cannot be imported here; fall back to a stand-in
+    # whose `GPU` decorator leaves the decorated function untouched.
+    class NoneSpaces:
+        """No-op replacement for the `spaces` module."""
+        def GPU(self, fn=None, **_options):
+            """Return `fn` unchanged; also usable as `@spaces.GPU(...)`."""
+            if fn is None:
+                # Called with options only: hand back a pass-through decorator.
+                return lambda wrapped: wrapped
+            return fn
+    spaces = NoneSpaces()
 import os
 import logging
 from modules.normalization import text_normalize
 from modules import refiner, config
+from modules.utils import env, audio
 from modules.SentenceSplitter import SentenceSplitter
 torch._dynamo.config.cache_size_limit = 64
     "tts_max": 1000,
     "ssml_max": 5000,
     "spliter_threshold": 100,
+    "max_batch_size": 8,
 }
 @torch.inference_mode()
 @spaces.GPU
+def synthesize_ssml(ssml: str, batch_size=4):
     try:
         batch_size = int(batch_size)
     except Exception:
     buffer.seek(0)
+    audio_data = buffer.read()
+    audio_data = audio.audio_to_int16(audio_data)
+    return audio_data
 @torch.inference_mode()
     prefix,
     style,
     disable_normalize=False,
+    batch_size=4,
 ):
     try:
         batch_size = int(batch_size)
     except Exception:
+        batch_size = 4
     max_len = webui_config["tts_max"]
     text = text.strip()[0:max_len]
             prompt2=prompt2,
             prefix=prefix,
         )
     else:
         spliter = SentenceSplitter(webui_config["spliter_threshold"])
         sentences = spliter.parse(text)
         sample_rate = audio_data_batch[0][0]
         audio_data = np.concatenate([data for _, data in audio_data_batch])
+    audio_data = audio.audio_to_int16(audio_data)
+    return sample_rate, audio_data
 @torch.inference_mode()
                 batch_size_input = gr.Slider(
                     1,
                     webui_config["max_batch_size"],
+                    value=4,
                     step=1,
                     label="Batch Size",
                 )
                 # batch size
                 batch_size_input = gr.Slider(
                     label="Batch Size",
+                    value=4,
                     minimum=1,
                     maximum=webui_config["max_batch_size"],
                     step=1,
     webui_config["tts_max"] = env.get_env_or_arg(args, "tts_max_len", 1000, int)
     webui_config["ssml_max"] = env.get_env_or_arg(args, "ssml_max_len", 5000, int)
+    webui_config["max_batch_size"] = env.get_env_or_arg(args, "max_batch_size", 8, int)
     demo = create_interface()