ChatTTS-Forge

Running

App Files Files Community

zhzluke96 committed on Jun 27

Commit

f367757

•

1 Parent(s): bb4ceb3

update

Browse files

Files changed (9) hide show

modules/SentenceSplitter.py +75 -67
modules/models.py +7 -0
modules/utils/audio.py +1 -1
modules/utils/html.py +18 -2
modules/webui/ssml/podcast_tab.py +15 -3
modules/webui/ssml/spliter_tab.py +36 -8
modules/webui/webui_utils.py +5 -4
requirements.txt +2 -1
webui.py +0 -8

modules/SentenceSplitter.py CHANGED Viewed

@@ -2,87 +2,95 @@ import re
 import zhon
 from modules.utils.detect_lang import guess_lang
-def split_zhon_sentence(text):
-    result = []
-    pattern = re.compile(zhon.hanzi.sentence)
-    start = 0
-    for match in pattern.finditer(text):
-        # 获取匹配的中文句子
-        end = match.end()
-        result.append(text[start:end])
-        start = end
-    # 最后一个中文句子后面的内容（如果有）也需要添加到结果中
-    if start < len(text):
-        result.append(text[start:])
-    result = [t for t in result if t.strip()]
-    return result
-def split_en_sentence(text):
-    """
-    Split English text into sentences.
-    """
-    # Define a regex pattern for English sentence splitting
-    pattern = re.compile(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s")
-    result = pattern.split(text)
-    # Filter out any empty strings or strings that are just whitespace
-    result = [sentence.strip() for sentence in result if sentence.strip()]
-    return result
-def is_eng_sentence(text):
-    return guess_lang(text) == "en"
-def split_zhon_paragraph(text):
-    lines = text.split("\n")
-    result = []
-    for line in lines:
-        if is_eng_sentence(line):
-            result.extend(split_en_sentence(line))
-        else:
-            result.extend(split_zhon_sentence(line))
-    return result
-# 解析文本 并根据停止符号分割成句子
-# 可以设置最大阈值，即如果分割片段小于这个阈值会与下一段合并
-class SentenceSplitter:
-    def __init__(self, threshold=100):
-        self.sentence_threshold = threshold
-    def parse(self, text):
-        sentences = split_zhon_paragraph(text)
-        # 合并小于最大阈值的片段
-        merged_sentences = []
-        temp_sentence = []
-        for sentence in sentences:
-            if len(sentence) < self.sentence_threshold:
-                temp_sentence.extend(sentence)
-                if len(temp_sentence) >= self.sentence_threshold:
-                    merged_sentences.append(temp_sentence)
-                    temp_sentence = []
             else:
-                if temp_sentence:
-                    merged_sentences.append(temp_sentence)
-                    temp_sentence = []
-                merged_sentences.append(sentence)
         if temp_sentence:
             merged_sentences.append(temp_sentence)
-        joind_sentences = [
-            "".join(sentence) for sentence in merged_sentences if sentence
-        ]
-        return joind_sentences
 if __name__ == "__main__":

 import zhon
+from modules.models import get_tokenizer
 from modules.utils.detect_lang import guess_lang
+# 解析文本 并根据停止符号分割成句子
+# 可以设置最大阈值，即如果分割片段小于这个阈值会与下一段合并
+class SentenceSplitter:
+    """
+    Split text into sentences at stop punctuation and merge short pieces.
+    Every input line is split with the English or the Chinese splitter
+    (chosen via ``guess_lang``); neighbouring sentences are then merged
+    while their combined character length stays below ``threshold``.
+    """
+    # Separator inserted between two sentences merged into one segment.
+    SEP_TOKEN = " "
+    def __init__(self, threshold=100):
+        """
+        Store the merge threshold and fetch the tokenizer via ``get_tokenizer()``.
+        ``threshold`` must be a positive ``int``; otherwise the assertion fails.
+        """
+        # NOTE(review): ``assert`` is stripped under ``python -O``; kept as-is so
+        # the exception type seen by existing callers does not change.
+        assert (
+            isinstance(threshold, int) and threshold > 0
+        ), "Threshold must be greater than 0."
+        self.sentence_threshold = threshold
+        self.tokenizer = get_tokenizer()
+    def count_tokens(self, text: str):
+        """
+        Return the number of tokens ``self.tokenizer.tokenize`` yields for *text*.
+        """
+        return len(self.tokenizer.tokenize(text))
+    def parse(self, text: str):
+        """
+        Split *text* into sentences, then merge short neighbours.
+        Returns the list of merged segments.
+        """
+        sentences = self.split_paragraph(text)
+        sentences = self.merge_text_by_threshold(sentences)
+        return sentences
+    def merge_text_by_threshold(self, setences: list[str]):
+        """
+        Greedily merge consecutive sentences into segments.
+        A sentence is joined onto the current segment (with ``SEP_TOKEN``)
+        while ``len(segment) + len(sentence)`` stays below the threshold;
+        otherwise the segment is emitted and the sentence starts a new one.
+        Lengths are measured in characters, not tokens.
+        Segments never start with ``SEP_TOKEN`` and no empty segment is emitted.
+        """
+        merged_sentences: list[str] = []
+        temp_sentence = ""
+        for sentence in setences:
+            if not temp_sentence:
+                # Nothing buffered yet: start the segment with the sentence
+                # itself, so it gets no leading separator and an over-long
+                # first sentence does not flush an empty segment.
+                temp_sentence = sentence
+            elif len(temp_sentence) + len(sentence) < self.sentence_threshold:
+                temp_sentence += SentenceSplitter.SEP_TOKEN + sentence
             else:
+                merged_sentences.append(temp_sentence)
+                temp_sentence = sentence
         if temp_sentence:
             merged_sentences.append(temp_sentence)
+        return merged_sentences
+    def split_paragraph(self, text: str):
+        """
+        Split *text* into sentences line by line.
+        Lines detected as English use ``split_en_sentence``; all other lines
+        use ``split_zhon_sentence``.
+        """
+        lines = text.split("\n")
+        sentences: list[str] = []
+        for line in lines:
+            if self.is_eng_sentence(line):
+                sentences.extend(self.split_en_sentence(line))
+            else:
+                sentences.extend(self.split_zhon_sentence(line))
+        return sentences
+    def is_eng_sentence(self, text: str):
+        """
+        Return True when ``guess_lang`` reports ``"en"`` for *text*.
+        """
+        return guess_lang(text) == "en"
+    def split_en_sentence(self, text: str):
+        """
+        Split English text into stripped, non-blank sentences.
+        The split point is a whitespace character that follows ``.``, ``?``
+        or ``!``, except after sequences shaped like ``e.g.`` or ``Mr.``
+        (the two negative look-behinds).
+        """
+        pattern = re.compile(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s")
+        sentences = pattern.split(text)
+        sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
+        return sentences
+    def split_zhon_sentence(self, text: str):
+        """
+        Split Chinese text into sentences using ``zhon.hanzi.sentence``.
+        The text is cut at the end of every pattern match; whatever follows
+        the last match is kept as a final piece. Blank pieces are dropped,
+        but the remaining pieces are not stripped.
+        """
+        sentences: list[str] = []
+        pattern = re.compile(zhon.hanzi.sentence)
+        start = 0
+        for match in pattern.finditer(text):
+            end = match.end()
+            sentences.append(text[start:end])
+            start = end
+        if start < len(text):
+            sentences.append(text[start:])
+        sentences = [t for t in sentences if t.strip()]
+        return sentences
 if __name__ == "__main__":

modules/models.py CHANGED Viewed

@@ -3,6 +3,7 @@ import logging
 import threading
 import torch
 from modules import config
 from modules.ChatTTS import ChatTTS
@@ -76,3 +77,9 @@ def reload_chat_tts():
     instance = load_chat_tts()
     logger.info("ChatTTS models reloaded")
     return instance

 import threading
 import torch
+from transformers import LlamaTokenizer
 from modules import config
 from modules.ChatTTS import ChatTTS
     instance = load_chat_tts()
     logger.info("ChatTTS models reloaded")
     return instance
+def get_tokenizer() -> LlamaTokenizer:
+    """
+    Return the ``"tokenizer"`` entry of ``pretrain_models`` on the ChatTTS
+    instance obtained from ``load_chat_tts()``.
+    """
+    return load_chat_tts().pretrain_models["tokenizer"]

modules/utils/audio.py CHANGED Viewed

@@ -2,9 +2,9 @@ import sys
 from io import BytesIO
 import numpy as np
 import soundfile as sf
 from pydub import AudioSegment, effects
-import pyrubberband as pyrb
 INT16_MAX = np.iinfo(np.int16).max

 from io import BytesIO
 import numpy as np
+import pyrubberband as pyrb
 import soundfile as sf
 from pydub import AudioSegment, effects
 INT16_MAX = np.iinfo(np.int16).max

modules/utils/html.py CHANGED Viewed

@@ -1,6 +1,10 @@
 from html.parser import HTMLParser
 class HTMLTagRemover(HTMLParser):
     def __init__(self):
         super().__init__()
@@ -20,7 +24,19 @@ def remove_html_tags(text):
     return parser.get_data()
 if __name__ == "__main__":
-    input_text = "<h1>一个标题</h1> 这是一段包含<code>标签</code>的文本。"
-    output_text = remove_html_tags(input_text)
     print(output_text)  # 输出： 一个标题 这是一段包含标签的文本。

+import html
+import re
 from html.parser import HTMLParser
+# NOTE: 现在没用这个，因为不好解决转义字符的问题
+#       除非分段处理，但是太麻烦了...
 class HTMLTagRemover(HTMLParser):
     def __init__(self):
         super().__init__()
     return parser.get_data()
+def remove_html_tags_re(text):
+    """
+    Unescape HTML entities in *text*, then replace each tag with one space.
+    A tag here is ``<`` or ``</`` followed by at least one character from
+    a-z, A-Z or 1-9 and then anything up to the next ``>``; constructs such
+    as ``<设定>`` therefore stay in the output.
+    """
+    unescaped = html.unescape(text)
+    return re.sub(r"</?([a-zA-Z1-9]+)[^>]*>", " ", unescaped)
 if __name__ == "__main__":
+    input_text = """
+<h1>一个标题</h1> 这是一段包含<code>标签</code>的文本。 <code>&amp;</code>
+<设定>
+一些文本
+</设定>
+"""
+    # input_text = "我&你"
+    output_text = remove_html_tags_re(input_text)
     print(output_text)  # 输出： 一个标题 这是一段包含标签的文本。

modules/webui/ssml/podcast_tab.py CHANGED Viewed

@@ -19,13 +19,18 @@ def merge_dataframe_to_ssml(msg, spk, style, df: pd.DataFrame):
         spk = row.get("speaker")
         style = row.get("style")
         ssml += f"{indent}<voice"
         if spk:
             ssml += f' spk="{spk}"'
         if style:
             ssml += f' style="{style}"'
         ssml += ">\n"
-        ssml += f"{indent}{indent}{text_normalize(text)}\n"
         ssml += f"{indent}</voice>\n"
     # 原封不动输出回去是为了触发 loadding 效果
     return msg, spk, style, f"<speak version='0.1'>\n{ssml}</speak>"
@@ -42,6 +47,7 @@ def create_ssml_podcast_tab(ssml_input: gr.Textbox, tabs1: gr.Tabs, tabs2: gr.Ta
     with gr.Row():
         with gr.Column(scale=1):
             with gr.Group():
                 spk_input_dropdown = gr.Dropdown(
                     choices=get_spk_choices(),
                     interactive=True,
@@ -55,13 +61,19 @@ def create_ssml_podcast_tab(ssml_input: gr.Textbox, tabs1: gr.Tabs, tabs2: gr.Ta
                     show_label=False,
                     value="*auto",
                 )
             with gr.Group():
                 msg = gr.Textbox(
-                    lines=5, label="Message", placeholder="Type speaker message here"
                 )
                 add = gr.Button("Add")
                 undo = gr.Button("Undo")
                 clear = gr.Button("Clear")
         with gr.Column(scale=5):
             with gr.Group():
                 gr.Markdown("📔Script")
@@ -75,7 +87,7 @@ def create_ssml_podcast_tab(ssml_input: gr.Textbox, tabs1: gr.Tabs, tabs2: gr.Ta
                     col_count=(4, "fixed"),
                 )
-    send_to_ssml_btn = gr.Button("📩Send to SSML", variant="primary")
     def add_message(msg, spk, style, sheet: pd.DataFrame):
         if not msg:

         spk = row.get("speaker")
         style = row.get("style")
+        text = text_normalize(text)
+        if text.strip() == "":
+            continue
         ssml += f"{indent}<voice"
         if spk:
             ssml += f' spk="{spk}"'
         if style:
             ssml += f' style="{style}"'
         ssml += ">\n"
+        ssml += f"{indent}{indent}{text}\n"
         ssml += f"{indent}</voice>\n"
     # 原封不动输出回去是为了触发 loadding 效果
     return msg, spk, style, f"<speak version='0.1'>\n{ssml}</speak>"
     with gr.Row():
         with gr.Column(scale=1):
             with gr.Group():
+                gr.Markdown("🗣️Speaker")
                 spk_input_dropdown = gr.Dropdown(
                     choices=get_spk_choices(),
                     interactive=True,
                     show_label=False,
                     value="*auto",
                 )
             with gr.Group():
+                gr.Markdown("📝Text Input")
                 msg = gr.Textbox(
+                    lines=5,
+                    label="Message",
+                    show_label=False,
+                    placeholder="Type speaker message here",
                 )
                 add = gr.Button("Add")
                 undo = gr.Button("Undo")
                 clear = gr.Button("Clear")
         with gr.Column(scale=5):
             with gr.Group():
                 gr.Markdown("📔Script")
                     col_count=(4, "fixed"),
                 )
+            send_to_ssml_btn = gr.Button("📩Send to SSML", variant="primary")
     def add_message(msg, spk, style, sheet: pd.DataFrame):
         if not msg:

modules/webui/ssml/spliter_tab.py CHANGED Viewed

@@ -22,6 +22,12 @@ def merge_dataframe_to_ssml(dataframe, spk, style, seed):
     indent = " " * 2
     for i, row in dataframe.iterrows():
         ssml += f"{indent}<voice"
         if spk:
             ssml += f' spk="{spk}"'
@@ -30,7 +36,7 @@ def merge_dataframe_to_ssml(dataframe, spk, style, seed):
         if seed:
             ssml += f' seed="{seed}"'
         ssml += ">\n"
-        ssml += f"{indent}{indent}{text_normalize(row.iloc[1])}\n"
         ssml += f"{indent}</voice>\n"
     # 原封不动输出回去是为了触发 loadding 效果
     return dataframe, spk, style, seed, f"<speak version='0.1'>\n{ssml}</speak>"
@@ -73,8 +79,9 @@ def create_spliter_tab(ssml_input, tabs1, tabs2):
                     show_label=False,
                     value="*auto",
                 )
             with gr.Group():
-                gr.Markdown("🗣️Seed")
                 infer_seed_input = gr.Number(
                     value=42,
                     label="Inference Seed",
@@ -84,10 +91,23 @@ def create_spliter_tab(ssml_input, tabs1, tabs2):
                 )
                 infer_seed_rand_button = gr.Button(
                     value="🎲",
                     variant="secondary",
                 )
-            send_btn = gr.Button("📩Send to SSML", variant="primary")
         with gr.Column(scale=3):
             with gr.Group():
@@ -102,19 +122,21 @@ def create_spliter_tab(ssml_input, tabs1, tabs2):
                 )
                 long_text_split_button = gr.Button("🔪Split Text")
-    with gr.Row():
-        with gr.Column(scale=3):
             with gr.Group():
                 gr.Markdown("🎨Output")
                 long_text_output = gr.DataFrame(
                     headers=["index", "text", "length"],
                     datatype=["number", "str", "number"],
                     elem_id="long-text-output",
-                    interactive=False,
                     wrap=True,
                     value=[],
                 )
     spk_input_dropdown.change(
         fn=lambda x: x.startswith("*") and "-1" or x.split(":")[-1].strip(),
         inputs=[spk_input_dropdown],
@@ -132,8 +154,14 @@ def create_spliter_tab(ssml_input, tabs1, tabs2):
     )
     long_text_split_button.click(
         split_long_text,
-        inputs=[long_text_input],
-        outputs=[long_text_output],
     )
     infer_seed_rand_button.click(

     indent = " " * 2
     for i, row in dataframe.iterrows():
+        text = row.iloc[1]
+        text = text_normalize(text)
+        if text.strip() == "":
+            continue
         ssml += f"{indent}<voice"
         if spk:
             ssml += f' spk="{spk}"'
         if seed:
             ssml += f' seed="{seed}"'
         ssml += ">\n"
+        ssml += f"{indent}{indent}{text}\n"
         ssml += f"{indent}</voice>\n"
     # 原封不动输出回去是为了触发 loadding 效果
     return dataframe, spk, style, seed, f"<speak version='0.1'>\n{ssml}</speak>"
                     show_label=False,
                     value="*auto",
                 )
             with gr.Group():
+                gr.Markdown("💃Inference Seed")
                 infer_seed_input = gr.Number(
                     value=42,
                     label="Inference Seed",
                 )
                 infer_seed_rand_button = gr.Button(
                     value="🎲",
+                    # tooltip="Random Seed",
                     variant="secondary",
                 )
+            with gr.Group():
+                gr.Markdown("🎛️Spliter")
+                eos_input = gr.Textbox(
+                    label="eos",
+                    value="[uv_break]",
+                )
+                spliter_thr_input = gr.Slider(
+                    label="Spliter Threshold",
+                    value=100,
+                    minimum=50,
+                    maximum=1000,
+                    step=1,
+                )
         with gr.Column(scale=3):
             with gr.Group():
                 )
                 long_text_split_button = gr.Button("🔪Split Text")
             with gr.Group():
                 gr.Markdown("🎨Output")
                 long_text_output = gr.DataFrame(
                     headers=["index", "text", "length"],
                     datatype=["number", "str", "number"],
                     elem_id="long-text-output",
+                    interactive=True,
                     wrap=True,
                     value=[],
+                    row_count=(0, "dynamic"),
+                    col_count=(3, "fixed"),
                 )
+                send_btn = gr.Button("📩Send to SSML", variant="primary")
     spk_input_dropdown.change(
         fn=lambda x: x.startswith("*") and "-1" or x.split(":")[-1].strip(),
         inputs=[spk_input_dropdown],
     )
     long_text_split_button.click(
         split_long_text,
+        inputs=[
+            long_text_input,
+            spliter_thr_input,
+            eos_input,
+        ],
+        outputs=[
+            long_text_output,
+        ],
     )
     infer_seed_rand_button.click(

modules/webui/webui_utils.py CHANGED Viewed

@@ -276,11 +276,12 @@ def refine_text(
 @torch.inference_mode()
 @spaces.GPU(duration=120)
-def split_long_text(long_text_input):
-    spliter = SentenceSplitter(webui_config.spliter_threshold)
     sentences = spliter.parse(long_text_input)
-    sentences = [text_normalize(s) for s in sentences]
     data = []
     for i, text in enumerate(sentences):
-        data.append([i, text, len(text)])
     return data

 @torch.inference_mode()
 @spaces.GPU(duration=120)
+def split_long_text(long_text_input, spliter_threshold=100, eos=""):
+    """
+    Split long text into segments and build the rows for the output table.
+    Each sentence segment is normalized with ``text_normalize`` and gets
+    *eos* appended. Returns a list of ``[index, text, token_count]`` rows,
+    where ``token_count`` comes from ``SentenceSplitter.count_tokens``.
+    """
+    # The threshold is wired to a UI slider, which may deliver a float such as
+    # 100.0, while SentenceSplitter asserts an ``int`` threshold; coerce here.
+    spliter = SentenceSplitter(threshold=int(spliter_threshold))
     sentences = spliter.parse(long_text_input)
+    sentences = [text_normalize(s) + eos for s in sentences]
     data = []
     for i, text in enumerate(sentences):
+        token_length = spliter.count_tokens(text)
+        data.append([i, text, token_length])
     return data

requirements.txt CHANGED Viewed

@@ -26,4 +26,5 @@ cn2an
 python-box
 ftfy
 librosa
-pyrubberband

 python-box
 ftfy
 librosa
+pyrubberband
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu118torch1.12cxx11abiFALSE-cp310-cp310-linux_x86_64.whl

webui.py CHANGED Viewed

@@ -30,14 +30,6 @@ from modules.webui.app import create_interface, webui_init
 dcls_patch()
 ignore_useless_warnings()
-import subprocess
-subprocess.run(
-    "pip install flash-attn --no-build-isolation",
-    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
-    shell=True,
-)
 def setup_webui_args(parser: argparse.ArgumentParser):
     parser.add_argument("--server_name", type=str, help="server name")

 dcls_patch()
 ignore_useless_warnings()
 def setup_webui_args(parser: argparse.ArgumentParser):
     parser.add_argument("--server_name", type=str, help="server name")