ChatTTS-Forge

Running

App Files Community

zhzluke96 committed on Jun 10

Commit

ebc4336

•

1 Parent(s): 21473c0

update

Browse files

Files changed (19) hide show

language/zh-CN.json +3 -2
modules/api/impl/google_api.py +12 -6
modules/api/impl/openai_api.py +5 -1
modules/api/impl/refiner_api.py +5 -1
modules/api/impl/ssml_api.py +15 -5
modules/api/impl/tts_api.py +38 -1
modules/generate_audio.py +5 -1
modules/normalization.py +6 -0
modules/ssml.py +0 -242
modules/ssml_parser/SSMLParser.py +9 -3
modules/utils/git.py +15 -0
modules/utils/markdown.py +7 -0
modules/webui/localization.py +9 -3
modules/webui/speaker/speaker_editor.py +147 -0
modules/webui/speaker_tab.py +3 -0
modules/webui/ssml/podcast_tab.py +38 -17
modules/webui/tts_tab.py +3 -1
modules/webui/webui_utils.py +1 -3
webui.py +1 -4

language/zh-CN.json CHANGED Viewed

@@ -57,8 +57,8 @@
   "🔊Generate speaker.pt": "🔊生成 speaker.pt",
   "Save .pt file": "保存.pt文件",
   "Save to File": "保存到文件",
-  "🎤Test voice": "🎤测试语音",
-  "Test Voice": "测试语音",
   "Current Seed": "当前种子",
   "Output Audio": "输出音频",
   "Merger": "融合",
@@ -79,6 +79,7 @@
   "README": "README",
   "readme": "readme",
   "changelog": "changelog",
   "TTS_STYLE_GUIDE": ["后缀为 _p 表示带prompt，效果更强但是影响质量"],
   "SSML_SPLITER_GUIDE": [
     "- 字数限制详见README，超过部分将截断",

   "🔊Generate speaker.pt": "🔊生成 speaker.pt",
   "Save .pt file": "保存.pt文件",
   "Save to File": "保存到文件",
+  "🎤Test voice": "🎤试语",
+  "Test Voice": "试语",
   "Current Seed": "当前种子",
   "Output Audio": "输出音频",
   "Merger": "融合",
   "README": "README",
   "readme": "readme",
   "changelog": "changelog",
+  "💼Speaker file": "💼音色文件",
   "TTS_STYLE_GUIDE": ["后缀为 _p 表示带prompt，效果更强但是影响质量"],
   "SSML_SPLITER_GUIDE": [
     "- 字数限制详见README，超过部分将截断",

modules/api/impl/google_api.py CHANGED Viewed

@@ -14,7 +14,7 @@ from modules import generate_audio as generate
 from modules.speaker import speaker_mgr
-from modules.ssml import parse_ssml
 from modules.SynthesizeSegments import (
     SynthesizeSegments,
     combine_audio_segments,
@@ -65,6 +65,8 @@ async def google_text_synthesize(request: GoogleTextSynthesizeRequest):
     audioConfig = request.audioConfig
     # 提取参数
     language_code = voice.languageCode
     voice_name = voice.name
     infer_seed = voice.seed or 42
@@ -86,9 +88,8 @@ async def google_text_synthesize(request: GoogleTextSynthesizeRequest):
     # TODO maybe need to change the sample rate
     sample_rate = 24000
-    # TODO 使用 speaker
-    spk = speaker_mgr.get_speaker(voice_name)
-    if spk is None:
         raise HTTPException(
             status_code=400, detail="The specified voice name is not supported."
         )
@@ -120,7 +121,8 @@ async def google_text_synthesize(request: GoogleTextSynthesizeRequest):
         elif input.ssml:
             # 处理SSML合成逻辑
-            segments = parse_ssml(input.ssml)
             for seg in segments:
                 seg["text"] = text_normalize(seg["text"], is_end=True)
@@ -171,7 +173,11 @@ async def google_text_synthesize(request: GoogleTextSynthesizeRequest):
         import logging
         logging.exception(e)
-        raise HTTPException(status_code=500, detail=str(e))
 def setup(app: APIManager):

 from modules.speaker import speaker_mgr
+from modules.ssml_parser.SSMLParser import create_ssml_parser
 from modules.SynthesizeSegments import (
     SynthesizeSegments,
     combine_audio_segments,
     audioConfig = request.audioConfig
     # 提取参数
+    # TODO 这个也许应该传给 normalizer
     language_code = voice.languageCode
     voice_name = voice.name
     infer_seed = voice.seed or 42
     # TODO maybe need to change the sample rate
     sample_rate = 24000
+    # 虽然 calc_spk_style 可以解析 seed 形式，但是这个接口只准备支持 speakers list 中存在的 speaker
+    if speaker_mgr.get_speaker(voice_name) is None:
         raise HTTPException(
             status_code=400, detail="The specified voice name is not supported."
         )
         elif input.ssml:
             # 处理SSML合成逻辑
+            parser = create_ssml_parser()
+            segments = parser.parse(input.ssml)
             for seg in segments:
                 seg["text"] = text_normalize(seg["text"], is_end=True)
         import logging
         logging.exception(e)
+        if isinstance(e, HTTPException):
+            raise e
+        else:
+            raise HTTPException(status_code=500, detail=str(e))
 def setup(app: APIManager):

modules/api/impl/openai_api.py CHANGED Viewed

@@ -115,7 +115,11 @@ async def openai_speech_api(
         import logging
         logging.exception(e)
-        raise HTTPException(status_code=500, detail=str(e))
 class TranscribeSegment(BaseModel):

         import logging
         logging.exception(e)
+        if isinstance(e, HTTPException):
+            raise e
+        else:
+            raise HTTPException(status_code=500, detail=str(e))
 class TranscribeSegment(BaseModel):

modules/api/impl/refiner_api.py CHANGED Viewed

@@ -42,7 +42,11 @@ async def refiner_prompt_post(request: RefineTextRequest):
         import logging
         logging.exception(e)
-        raise HTTPException(status_code=500, detail=str(e))
 def setup(api_manager: APIManager):

         import logging
         logging.exception(e)
+        if isinstance(e, HTTPException):
+            raise e
+        else:
+            raise HTTPException(status_code=500, detail=str(e))
 def setup(api_manager: APIManager):

modules/api/impl/ssml_api.py CHANGED Viewed

@@ -7,7 +7,7 @@ from fastapi.responses import FileResponse
 from modules.normalization import text_normalize
-from modules.ssml import parse_ssml
 from modules.SynthesizeSegments import (
     SynthesizeSegments,
     combine_audio_segments,
@@ -34,7 +34,7 @@ async def synthesize_ssml(
 ):
     try:
         ssml = request.ssml
-        format = request.format
         batch_size = request.batch_size
         if batch_size < 1:
@@ -42,10 +42,16 @@ async def synthesize_ssml(
                 status_code=400, detail="Batch size must be greater than 0."
             )
-        if not ssml:
             raise HTTPException(status_code=400, detail="SSML content is required.")
-        segments = parse_ssml(ssml)
         for seg in segments:
             seg["text"] = text_normalize(seg["text"], is_end=True)
@@ -63,7 +69,11 @@ async def synthesize_ssml(
         import logging
         logging.exception(e)
-        raise HTTPException(status_code=500, detail=str(e))
 def setup(api_manager: APIManager):

 from modules.normalization import text_normalize
+from modules.ssml_parser.SSMLParser import create_ssml_parser
 from modules.SynthesizeSegments import (
     SynthesizeSegments,
     combine_audio_segments,
 ):
     try:
         ssml = request.ssml
+        format = request.format.lower()
         batch_size = request.batch_size
         if batch_size < 1:
                 status_code=400, detail="Batch size must be greater than 0."
             )
+        if not ssml or ssml == "":
             raise HTTPException(status_code=400, detail="SSML content is required.")
+        if format not in ["mp3", "wav"]:
+            raise HTTPException(
+                status_code=400, detail="Format must be 'mp3' or 'wav'."
+            )
+        parser = create_ssml_parser()
+        segments = parser.parse(ssml)
         for seg in segments:
             seg["text"] = text_normalize(seg["text"], is_end=True)
         import logging
         logging.exception(e)
+        if isinstance(e, HTTPException):
+            raise e
+        else:
+            raise HTTPException(status_code=500, detail=str(e))
 def setup(api_manager: APIManager):

modules/api/impl/tts_api.py CHANGED Viewed

@@ -44,6 +44,39 @@ class TTSParams(BaseModel):
 async def synthesize_tts(params: TTSParams = Depends()):
     try:
         text = text_normalize(params.text, is_end=False)
         calc_params = api_utils.calc_spk_style(spk=params.spk, style=params.style)
@@ -87,7 +120,11 @@ async def synthesize_tts(params: TTSParams = Depends()):
         import logging
         logging.exception(e)
-        raise HTTPException(status_code=500, detail=str(e))
 def setup(api_manager: APIManager):

 async def synthesize_tts(params: TTSParams = Depends()):
     try:
+        # Validate text
+        if not params.text.strip():
+            raise HTTPException(
+                status_code=422, detail="Text parameter cannot be empty"
+            )
+        # Validate temperature
+        if not (0 <= params.temperature <= 1):
+            raise HTTPException(
+                status_code=422, detail="Temperature must be between 0 and 1"
+            )
+        # Validate top_P
+        if not (0 <= params.top_P <= 1):
+            raise HTTPException(status_code=422, detail="top_P must be between 0 and 1")
+        # Validate top_K
+        if params.top_K <= 0:
+            raise HTTPException(
+                status_code=422, detail="top_K must be a positive integer"
+            )
+        if params.top_K > 100:
+            raise HTTPException(
+                status_code=422, detail="top_K must be less than or equal to 100"
+            )
+        # Validate format
+        if params.format not in ["mp3", "wav"]:
+            raise HTTPException(
+                status_code=422,
+                detail="Invalid format. Supported formats are mp3 and wav",
+            )
         text = text_normalize(params.text, is_end=False)
         calc_params = api_utils.calc_spk_style(spk=params.spk, style=params.style)
         import logging
         logging.exception(e)
+        if isinstance(e, HTTPException):
+            raise e
+        else:
+            raise HTTPException(status_code=500, detail=str(e))
 def setup(api_manager: APIManager):

modules/generate_audio.py CHANGED Viewed

@@ -79,7 +79,11 @@ def generate_audio_batch(
         params_infer_code["spk_emb"] = spk.emb
         logger.info(("spk", spk.name))
     else:
-        raise ValueError(f"spk must be int or Speaker, but: <{type(spk)}> {spk}")
     logger.info(
         {

         params_infer_code["spk_emb"] = spk.emb
         logger.info(("spk", spk.name))
     else:
+        logger.warn(
+            f"spk must be int or Speaker, but: <{type(spk)}> {spk}, wiil set to default voice"
+        )
+        with SeedContext(2, True):
+            params_infer_code["spk_emb"] = chat_tts.sample_random_speaker()
     logger.info(
         {

modules/normalization.py CHANGED Viewed

@@ -5,6 +5,10 @@ from modules.utils.markdown import markdown_to_text
 from modules import models
 import re
 @lru_cache(maxsize=64)
 def is_chinese(text):
@@ -159,6 +163,8 @@ def replace_unk_tokens(text):
     """
     把不在字典里的字符替换为 " , "
     """
     chat_tts = models.load_chat_tts()
     if "tokenizer" not in chat_tts.pretrain_models:
         # 这个地方只有在 huggingface spaces 中才会触发

 from modules import models
 import re
+# 是否关闭 unk token 检查
+# NOTE: 单测的时候用于跳过模型加载
+DISABLE_UNK_TOKEN_CHECK = False
 @lru_cache(maxsize=64)
 def is_chinese(text):
     """
     把不在字典里的字符替换为 " , "
     """
+    if DISABLE_UNK_TOKEN_CHECK:
+        return text
     chat_tts = models.load_chat_tts()
     if "tokenizer" not in chat_tts.pretrain_models:
         # 这个地方只有在 huggingface spaces 中才会触发

modules/ssml.py CHANGED Viewed

@@ -66,245 +66,3 @@ def apply_random_seed(attrs: dict):
         seed = random.randint(0, 2**32 - 1)
         attrs["seed"] = seed
         logger.info(f"random seed: {seed}")
-class NotSupportSSML(Exception):
-    pass
-def parse_ssml(ssml: str) -> List[Dict[str, Any]]:
-    root = etree.fromstring(ssml)
-    ssml_version = root.get("version", "NONE")
-    if ssml_version != "0.1":
-        raise NotSupportSSML("Unsupported ssml version: {ssml_version}")
-    segments = []
-    for voice in root.findall(".//voice"):
-        voice_attrs = {
-            "spk": voice.get("spk"),
-            "style": voice.get("style"),
-            "seed": voice.get("seed"),
-            "top_p": voice.get("top_p"),
-            "top_k": voice.get("top_k"),
-            "temp": voice.get("temp"),
-            "prompt1": voice.get("prompt1"),
-            "prompt2": voice.get("prompt2"),
-            "prefix": voice.get("prefix"),
-            "normalize": voice.get("normalize"),
-        }
-        voice_attrs = {k: v for k, v in voice_attrs.items() if v is not None}
-        expand_spk(voice_attrs)
-        expand_style(voice_attrs)
-        merge_prompt(voice_attrs, voice)
-        apply_random_seed(voice_attrs)
-        voice_segments = []
-        if voice_attrs.get("temp", "") == "min":
-            # ref: https://github.com/2noise/ChatTTS/issues/123#issue-2326908144
-            voice_attrs["temp"] = 0.000000000001
-        if voice_attrs.get("temp", "") == "max":
-            voice_attrs["temp"] = 1
-        # 处理 voice 开头的文本
-        if voice.text and voice.text.strip():
-            voice_segments.append(
-                {"text": voice.text.strip(), "attrs": voice_attrs.copy()}
-            )
-        # 处理 voice 内部的文本和 prosody 元素
-        for node in voice.iterchildren():
-            if node.tag == "prosody":
-                prosody_attrs = voice_attrs.copy()
-                new_attrs = {
-                    "rate": node.get("rate"),
-                    "volume": node.get("volume"),
-                    "pitch": node.get("pitch"),
-                }
-                prosody_attrs.update(
-                    {k: v for k, v in new_attrs.items() if v is not None}
-                )
-                expand_style(prosody_attrs)
-                merge_prompt(prosody_attrs, node)
-                apply_random_seed(voice_attrs)
-                if node.text and node.text.strip():
-                    voice_segments.append(
-                        {"text": node.text.strip(), "attrs": prosody_attrs}
-                    )
-            elif node.tag == "break":
-                time_ms = int(node.get("time", "0").replace("ms", ""))
-                segment = {"break": time_ms}
-                voice_segments.append(segment)
-            if node.tail and node.tail.strip():
-                voice_segments.append(
-                    {"text": node.tail.strip(), "attrs": voice_attrs.copy()}
-                )
-        end_segment = voice_segments[-1]
-        end_segment["is_end"] = True
-        segments = segments + voice_segments
-    logger.info(f"collect len(segments): {len(segments)}")
-    # logger.info(f"segments: {json.dumps(segments, ensure_ascii=False)}")
-    return segments
-if __name__ == "__main__":
-    # 示例 SSML 输入
-    ssml1 = """
-    <speak version="0.1">
-        <voice spk="20398768" seed="42" temp="min" top_p="0.9" top_k="20">
-            电影中梁朝伟扮演的陈永仁的
-            <prosody volume="5">
-                编号27149
-            </prosody>
-            <prosody rate="2">
-                编号27149
-            </prosody>
-            <prosody pitch="-12">
-                编号27149
-            </prosody>
-            <prosody pitch="12">
-                编号27149
-            </prosody>
-        </voice>
-        <voice spk="20398768" seed="42" speed="9">
-            编号27149
-        </voice>
-        <voice spk="20398768" seed="42">
-            电影中梁朝伟扮演的陈永仁的编号27149
-        </voice>
-    </speak>
-    """
-    ssml2 = """
-    <speak version="0.1">
-        <voice spk="Bob">
-            也可以合成多角色多情感的有声 [uv_break] 书 [uv_break] ，例如：
-        </voice>
-        <voice spk="Bob">
-            黛玉冷笑道：
-        </voice>
-        <voice spk="female2">
-            我说呢，亏了绊住，不然，早就飞了来了。
-        </voice>
-        <voice spk="Bob" speed="0">
-            宝玉道：
-        </voice>
-        <voice spk="Alice">
-            “只许和你玩，替你解闷。不过偶然到他那里，就说这些闲话。”
-        </voice>
-        <voice spk="female2">
-            “好没意思的话！去不去，关我什么事儿？又没叫你替我解闷儿，还许你不理我呢”
-        </voice>
-        <voice spk="Bob">
-            说着，便赌气回房去了。
-        </voice>
-    </speak>
-    """
-    ssml22 = """
-<speak version="0.1">
-    <voice spk="Bob" style="narration-relaxed">
-        下面是一个 ChatTTS 用于合成多角色多情感的有声书示例
-    </voice>
-    <voice spk="Bob" style="narration-relaxed">
-        黛玉冷笑道：
-    </voice>
-    <voice spk="female2" style="angry">
-        我说呢 [uv_break] ，亏了绊住，不然，早就飞起来了。
-    </voice>
-    <voice spk="Bob" style="narration-relaxed">
-        宝玉道：
-    </voice>
-    <voice spk="Alice" style="unfriendly">
-        “只许和你玩 [uv_break] ，替你解闷。不过偶然到他那里，就说这些闲话。”
-    </voice>
-    <voice spk="female2" style="angry">
-        “好没意思的话！[uv_break] 去不去，关我什么事儿？ 又没叫你替我解闷儿 [uv_break]，还许你不理我呢”
-    </voice>
-    <voice spk="Bob" style="narration-relaxed">
-        说着，便赌气回房去了。
-    </voice>
-</speak>
-    """
-    ssml3 = """
-    <speak version="0.1">
-        <voice spk="Bob" style="angry">
-            “你到底在想什么？这已经是第三次了！每次我都告诉你要按时完成任务，可你总是拖延。你知道这对整个团队有多大的影响吗？！”
-        </voice>
-        <voice spk="Bob" style="assistant">
-            “你到底在想什么？这已经是第三次了！每次我都告诉你要按时完成任务，可你总是拖延。你知道这对整个团队有多大的影响吗？！”
-        </voice>
-        <voice spk="Bob" style="gentle">
-            “你到底在想什么？这已经是第三次了！每次我都告诉你要按时完成任务，可你总是拖延。你知道这对整个团队有多大的影响吗？！”
-        </voice>
-    </speak>
-    """
-    ssml4 = """
-    <speak version="0.1">
-        <voice spk="Bob" style="narration-relaxed">
-            使用 prosody 控制生成文本的语速语调和音量，示例如下
-            <prosody>
-                无任何限制将会继承父级voice配置进行生成
-            </prosody>
-            <prosody rate="1.5">
-                设置 rate 大于1表示加速，小于1为减速
-            </prosody>
-            <prosody pitch="6">
-                设置 pitch 调整音调，设置为6表示提高6个半音
-            </prosody>
-            <prosody volume="2">
-                设置 volume 调整音量，设置为2表示提高2个分贝
-            </prosody>
-            在 voice 中无prosody包裹的文本即为默认生成状态下的语音
-        </voice>
-    </speak>
-    """
-    ssml5 = """
-    <speak version="0.1">
-        <voice spk="Bob" style="narration-relaxed">
-            使用 break 标签将会简单的
-            <break time="500" />
-            插入一段空白到生成结果中
-        </voice>
-    </speak>
-    """
-    ssml6 = """
-    <speak version="0.1">
-        <voice spk="Bob" style="excited">
-            temperature for sampling (may be overridden by style or speaker)
-            <break time="500" />
-            温度值用于采样，这个值有可能被 style 或者 speaker 覆盖
-            <break time="500" />
-            temperature for sampling ，这个值有可能被 style 或者 speaker 覆盖
-            <break time="500" />
-            温度值用于采样，(may be overridden by style or speaker)
-        </voice>
-    </speak>
-    """
-    segments = parse_ssml(ssml6)
-    print(segments)
-    # audio_segments = synthesize_segments(segments)
-    # combined_audio = combine_audio_segments(audio_segments)
-    # combined_audio.export("output.wav", format="wav")

         seed = random.randint(0, 2**32 - 1)
         attrs["seed"] = seed
         logger.info(f"random seed: {seed}")

modules/ssml_parser/SSMLParser.py CHANGED Viewed

@@ -29,6 +29,12 @@ class SSMLContext(Box):
         self.prompt2 = None
         self.prefix = None
 class SSMLSegment(Box):
     def __init__(self, text: str, attrs=SSMLContext()):
@@ -84,7 +90,7 @@ def create_ssml_parser():
     @parser.resolver("speak")
     def tag_speak(element, context, segments, parser):
-        ctx = copy.deepcopy(context)
         version = element.get("version")
         if version != "0.1":
@@ -95,7 +101,7 @@ def create_ssml_parser():
     @parser.resolver("voice")
     def tag_voice(element, context, segments, parser):
-        ctx = copy.deepcopy(context)
         ctx.spk = element.get("spk", ctx.spk)
         ctx.style = element.get("style", ctx.style)
@@ -131,7 +137,7 @@ def create_ssml_parser():
     @parser.resolver("prosody")
     def tag_prosody(element, context, segments, parser):
-        ctx = copy.deepcopy(context)
         ctx.spk = element.get("spk", ctx.spk)
         ctx.style = element.get("style", ctx.style)

         self.prompt2 = None
         self.prefix = None
+    def clone(self):
+        ctx = SSMLContext()
+        for k, v in self.items():
+            ctx[k] = v
+        return ctx
 class SSMLSegment(Box):
     def __init__(self, text: str, attrs=SSMLContext()):
     @parser.resolver("speak")
     def tag_speak(element, context, segments, parser):
+        ctx = context.clone() if context is not None else SSMLContext()
         version = element.get("version")
         if version != "0.1":
     @parser.resolver("voice")
     def tag_voice(element, context, segments, parser):
+        ctx = context.clone() if context is not None else SSMLContext()
         ctx.spk = element.get("spk", ctx.spk)
         ctx.style = element.get("style", ctx.style)
     @parser.resolver("prosody")
     def tag_prosody(element, context, segments, parser):
+        ctx = context.clone() if context is not None else SSMLContext()
         ctx.spk = element.get("spk", ctx.spk)
         ctx.style = element.get("style", ctx.style)

modules/utils/git.py CHANGED Viewed

@@ -2,14 +2,25 @@ from functools import lru_cache
 import os
 import subprocess
 from modules.utils import constants
 git = os.environ.get("GIT", "git")
 @lru_cache()
 def commit_hash():
     try:
         return subprocess.check_output(
             [git, "-C", constants.ROOT_DIR, "rev-parse", "HEAD"],
             shell=False,
@@ -22,6 +33,8 @@ def commit_hash():
 @lru_cache()
 def git_tag():
     try:
         return subprocess.check_output(
             [git, "-C", constants.ROOT_DIR, "describe", "--tags"],
             shell=False,
@@ -44,6 +57,8 @@ def git_tag():
 @lru_cache()
 def branch_name():
     try:
         return subprocess.check_output(
             [git, "-C", constants.ROOT_DIR, "rev-parse", "--abbrev-ref", "HEAD"],
             shell=False,

 import os
 import subprocess
 from modules.utils import constants
+# 用于判断是否在hf spaces
+try:
+    import spaces
+except:
+    spaces = None
 git = os.environ.get("GIT", "git")
+in_hf_spaces = spaces is not None
 @lru_cache()
 def commit_hash():
     try:
+        if in_hf_spaces:
+            return "<hf>"
         return subprocess.check_output(
             [git, "-C", constants.ROOT_DIR, "rev-parse", "HEAD"],
             shell=False,
 @lru_cache()
 def git_tag():
     try:
+        if in_hf_spaces:
+            return "<hf>"
         return subprocess.check_output(
             [git, "-C", constants.ROOT_DIR, "describe", "--tags"],
             shell=False,
 @lru_cache()
 def branch_name():
     try:
+        if in_hf_spaces:
+            return "<hf>"
         return subprocess.check_output(
             [git, "-C", constants.ROOT_DIR, "rev-parse", "--abbrev-ref", "HEAD"],
             shell=False,

modules/utils/markdown.py CHANGED Viewed

@@ -46,6 +46,10 @@ class PlainTextRenderer(mistune.HTMLRenderer):
         # remove code
         return ""
 def markdown_to_text(markdown_text):
     renderer = PlainTextRenderer()
@@ -69,6 +73,9 @@ console.log(1)
 - 列表项 2
 - 列表项 3
 > 这是一个引用。
 `代码片段`

         # remove code
         return ""
+    def thematic_break(self) -> str:
+        # remove break
+        return "\n"
 def markdown_to_text(markdown_text):
     renderer = PlainTextRenderer()
 - 列表项 2
 - 列表项 3
+1. 第一
+2. 第二
 > 这是一个引用。
 `代码片段`

modules/webui/localization.py CHANGED Viewed

@@ -1,7 +1,9 @@
 import json
 import os
 import gradio as gr
 current_translation = {}
 localization_root = os.path.join(
@@ -24,11 +26,15 @@ def localization_js(filename):
                         assert isinstance(v, str) or isinstance(
                             v, list
                         ), f"Value for key {k} is not a string or list"
             except Exception as e:
-                print(str(e))
-                print(f"Failed to load localization file {full_name}")
         else:
-            print(f"Localization file {full_name} not found")
     # current_translation = {k: 'XXX' for k in current_translation.keys()}  # use this to see if all texts are covered

 import json
 import os
 import gradio as gr
+import logging
+logger = logging.getLogger(__name__)
 current_translation = {}
 localization_root = os.path.join(
                         assert isinstance(v, str) or isinstance(
                             v, list
                         ), f"Value for key {k} is not a string or list"
+                    logger.info(f"Loaded localization file {full_name}")
             except Exception as e:
+                logger.warning(str(e))
+                logger.warning(f"Failed to load localization file {full_name}")
         else:
+            logger.warning(f"Localization file {full_name} does not exist")
+    else:
+        logger.warning(f"Localization file {filename} is not a string")
     # current_translation = {k: 'XXX' for k in current_translation.keys()}  # use this to see if all texts are covered

modules/webui/speaker/speaker_editor.py ADDED Viewed

	@@ -0,0 +1,147 @@

+import gradio as gr
+import torch
+from modules.speaker import Speaker
+from modules.hf import spaces
+from modules.webui import webui_config
+from modules.webui.webui_utils import tts_generate
+import tempfile
+@torch.inference_mode()
+@spaces.GPU
+def test_spk_voice(spk_file, text: str):
+    if spk_file == "" or spk_file is None:
+        return None
+    spk = Speaker.from_file(spk_file)
+    return tts_generate(
+        spk=spk,
+        text=text,
+    )
+def speaker_editor_ui():
+    def on_generate(spk_file, name, gender, desc):
+        spk: Speaker = Speaker.from_file(spk_file)
+        spk.name = name
+        spk.gender = gender
+        spk.desc = desc
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pt") as tmp_file:
+            torch.save(spk, tmp_file)
+            tmp_file_path = tmp_file.name
+        return tmp_file_path
+    def create_test_voice_card(spk_file):
+        with gr.Group():
+            gr.Markdown("🎤Test voice")
+            with gr.Row():
+                test_voice_btn = gr.Button(
+                    "Test Voice", variant="secondary", interactive=False
+                )
+                with gr.Column(scale=4):
+                    test_text = gr.Textbox(
+                        label="Test Text",
+                        placeholder="Please input test text",
+                        value=webui_config.localization.DEFAULT_SPEAKER_TEST_TEXT,
+                    )
+                    with gr.Row():
+                        with gr.Column(scale=4):
+                            output_audio = gr.Audio(label="Output Audio", format="mp3")
+        test_voice_btn.click(
+            fn=test_spk_voice,
+            inputs=[spk_file, test_text],
+            outputs=[output_audio],
+        )
+        return test_voice_btn
+    has_file = gr.State(False)
+    # TODO 也许需要写个说明？
+    # gr.Markdown("SPEAKER_CREATOR_GUIDE")
+    with gr.Row():
+        with gr.Column(scale=2):
+            with gr.Group():
+                gr.Markdown("💼Speaker file")
+                spk_file = gr.File(label="*.pt file", file_types=[".pt"])
+            with gr.Group():
+                gr.Markdown("ℹ️Speaker info")
+                name_input = gr.Textbox(
+                    label="Name",
+                    placeholder="Enter speaker name",
+                    value="*",
+                    interactive=False,
+                )
+                gender_input = gr.Textbox(
+                    label="Gender",
+                    placeholder="Enter gender",
+                    value="*",
+                    interactive=False,
+                )
+                desc_input = gr.Textbox(
+                    label="Description",
+                    placeholder="Enter description",
+                    value="*",
+                    interactive=False,
+                )
+            with gr.Group():
+                gr.Markdown("🔊Generate speaker.pt")
+                generate_button = gr.Button("Save .pt file", interactive=False)
+                output_file = gr.File(label="Save to File")
+        with gr.Column(scale=5):
+            btn1 = create_test_voice_card(spk_file=spk_file)
+            btn2 = create_test_voice_card(spk_file=spk_file)
+            btn3 = create_test_voice_card(spk_file=spk_file)
+            btn4 = create_test_voice_card(spk_file=spk_file)
+    generate_button.click(
+        fn=on_generate,
+        inputs=[spk_file, name_input, gender_input, desc_input],
+        outputs=[output_file],
+    )
+    def spk_file_change(spk_file):
+        empty = spk_file is None or spk_file == ""
+        if empty:
+            return [
+                gr.Textbox(value="*", interactive=False),
+                gr.Textbox(value="*", interactive=False),
+                gr.Textbox(value="*", interactive=False),
+                gr.Button(interactive=False),
+                gr.Button(interactive=False),
+                gr.Button(interactive=False),
+                gr.Button(interactive=False),
+                gr.Button(interactive=False),
+            ]
+        spk: Speaker = Speaker.from_file(spk_file)
+        return [
+            gr.Textbox(value=spk.name, interactive=True),
+            gr.Textbox(value=spk.gender, interactive=True),
+            gr.Textbox(value=spk.describe, interactive=True),
+            gr.Button(interactive=True),
+            gr.Button(interactive=True),
+            gr.Button(interactive=True),
+            gr.Button(interactive=True),
+            gr.Button(interactive=True),
+        ]
+    spk_file.change(
+        fn=spk_file_change,
+        inputs=[spk_file],
+        outputs=[
+            name_input,
+            gender_input,
+            desc_input,
+            generate_button,
+            btn1,
+            btn2,
+            btn3,
+            btn4,
+        ],
+    )

modules/webui/speaker_tab.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import gradio as gr
 from modules.webui.speaker.speaker_merger import create_speaker_merger
 from modules.webui.speaker.speaker_creator import speaker_creator_ui
@@ -7,6 +8,8 @@ from modules.webui.speaker.speaker_creator import speaker_creator_ui
 def create_speaker_panel():
     with gr.Tabs():
         with gr.TabItem("Creator"):
             speaker_creator_ui()
         with gr.TabItem("Merger"):

 import gradio as gr
+from modules.webui.speaker.speaker_editor import speaker_editor_ui
 from modules.webui.speaker.speaker_merger import create_speaker_merger
 from modules.webui.speaker.speaker_creator import speaker_creator_ui
 def create_speaker_panel():
     with gr.Tabs():
+        with gr.Tab("Editor"):
+            speaker_editor_ui()
         with gr.TabItem("Creator"):
             speaker_creator_ui()
         with gr.TabItem("Merger"):

modules/webui/ssml/podcast_tab.py CHANGED Viewed

@@ -7,45 +7,65 @@ from modules.webui import webui_utils
 from modules.hf import spaces
 podcast_default_case = [
-    [1, "female2", "你好，欢迎收听今天的播客内容。今天我们要聊的是中华料理。", "chat"],
-    [2, "Alice", "嗨，我特别期待这个话题！中华料理真的是博大精深。", "chat"],
     [
         3,
         "Bob",
-        "没错，中华料理有着几千年的历史，而且每个地区都有自己的特色菜。",
-        "chat",
     ],
     [
         4,
         "female2",
-        "那我们先从最有名的川菜开始吧。川菜以其麻辣著称，是很多人的最爱。",
-        "chat",
     ],
     [
         5,
         "Alice",
-        "对，我特别喜欢吃麻婆豆腐和辣子鸡。那种麻辣的感觉真是让人难以忘怀。",
-        "chat",
     ],
     [
         6,
         "Bob",
-        "除了川菜，粤菜也是很受欢迎的。粤菜讲究鲜美，像是白切鸡和蒸鱼都是经典。",
-        "chat",
     ],
-    [7, "female2", "对啊，粤菜的烹饪方式比较清淡，更注重食材本身的味道。", "chat"],
-    [8, "Alice", "还有北京的京菜，像北京烤鸭，那可是来北京必吃的美食。", "chat"],
     [
         9,
         "Bob",
-        "不仅如此，还有淮扬菜、湘菜、鲁菜等等，每个菜系都有其独特的风味。",
-        "chat",
     ],
     [
         10,
         "female2",
-        "对对对，像淮扬菜的狮子头，湘菜的剁椒鱼头，都是让人垂涎三尺的美味。",
-        "chat",
     ],
 ]
@@ -111,10 +131,11 @@ def create_ssml_podcast_tab(ssml_input: gr.Textbox, tabs1: gr.Tabs, tabs2: gr.Ta
                 script_table = gr.DataFrame(
                     headers=["index", "speaker", "text", "style"],
                     datatype=["number", "str", "str", "str"],
-                    interactive=False,
                     wrap=True,
                     value=podcast_default_case,
                     row_count=(0, "dynamic"),
                 )
     send_to_ssml_btn = gr.Button("📩Send to SSML", variant="primary")

 from modules.hf import spaces
 podcast_default_case = [
+    [
+        1,
+        "female2",
+        "你好，欢迎收听今天的播客内容。今天我们要聊的是中华料理。 [lbreak]",
+        "podcast_p",
+    ],
+    [
+        2,
+        "Alice",
+        "嗨，我特别期待这个话题！中华料理真的是博大精深。 [lbreak]",
+        "podcast_p",
+    ],
     [
         3,
         "Bob",
+        "没错，中华料理有着几千年的历史，而且每个地区都有自己的特色菜。 [lbreak]",
+        "podcast_p",
     ],
     [
         4,
         "female2",
+        "那我们先从最有名的川菜开始吧。川菜以其麻辣著称，是很多人的最爱。 [lbreak]",
+        "podcast_p",
     ],
     [
         5,
         "Alice",
+        "对，我特别喜欢吃麻婆豆腐和辣子鸡。那种麻辣的感觉真是让人难以忘怀。 [lbreak]",
+        "podcast_p",
     ],
     [
         6,
         "Bob",
+        "除了川菜，粤菜也是很受欢迎的。粤菜讲究鲜美，像是白切鸡和蒸鱼都是经典。 [lbreak]",
+        "podcast_p",
+    ],
+    [
+        7,
+        "female2",
+        "对啊，粤菜的烹饪方式比较清淡，更注重食材本身的味道。 [lbreak]",
+        "podcast_p",
+    ],
+    [
+        8,
+        "Alice",
+        "还有北京的京菜，像北京烤鸭，那可是来北京必吃的美食。 [lbreak]",
+        "podcast_p",
     ],
     [
         9,
         "Bob",
+        "不仅如此，还有淮扬菜、湘菜、鲁菜等等，每个菜系都有其独特的风味。 [lbreak]",
+        "podcast_p",
     ],
     [
         10,
         "female2",
+        "对对对，像淮扬菜的狮子头，湘菜的剁椒鱼头，都是让人垂涎三尺的美味。 [lbreak]",
+        "podcast_p",
     ],
 ]
                 script_table = gr.DataFrame(
                     headers=["index", "speaker", "text", "style"],
                     datatype=["number", "str", "str", "str"],
+                    interactive=True,
                     wrap=True,
                     value=podcast_default_case,
                     row_count=(0, "dynamic"),
+                    col_count=(4, "fixed"),
                 )
     send_to_ssml_btn = gr.Button("📩Send to SSML", variant="primary")

modules/webui/tts_tab.py CHANGED Viewed

@@ -91,7 +91,9 @@ def create_tts_interface():
                             )
                         with gr.Tab(label="Upload"):
-                            spk_file_upload = gr.File(label="Speaker (Upload)")
                             gr.Markdown("📝Speaker info")
                             infos = gr.Markdown("empty")

                             )
                         with gr.Tab(label="Upload"):
+                            spk_file_upload = gr.File(
+                                label="Speaker (Upload)", file_types=[".pt"]
+                            )
                             gr.Markdown("📝Speaker info")
                             infos = gr.Markdown("empty")

modules/webui/webui_utils.py CHANGED Viewed

@@ -93,13 +93,11 @@ def apply_audio_enhance(audio_data, sr, enable_denoise, enable_enhance):
     tensor = torch.from_numpy(audio_data).float().squeeze().cpu()
     enhancer = load_enhancer(device)
-    if enable_enhance:
         lambd = 0.9 if enable_denoise else 0.1
         tensor, sr = enhancer.enhance(
             tensor, sr, tau=0.5, nfe=64, solver="rk4", lambd=lambd, device=device
         )
-    elif enable_denoise:
-        tensor, sr = enhancer.denoise(tensor, sr)
     audio_data = tensor.cpu().numpy()
     return audio_data, int(sr)

     tensor = torch.from_numpy(audio_data).float().squeeze().cpu()
     enhancer = load_enhancer(device)
+    if enable_enhance or enable_denoise:
         lambd = 0.9 if enable_denoise else 0.1
         tensor, sr = enhancer.enhance(
             tensor, sr, tau=0.5, nfe=64, solver="rk4", lambd=lambd, device=device
         )
     audio_data = tensor.cpu().numpy()
     return audio_data, int(sr)

webui.py CHANGED Viewed

@@ -84,7 +84,6 @@ if __name__ == "__main__":
     parser.add_argument(
         "--language",
         type=str,
-        default="zh-CN",
         help="Set the default language for the webui",
     )
     args = parser.parse_args()
@@ -106,7 +105,7 @@ if __name__ == "__main__":
     device_id = get_and_update_env(args, "device_id", None, str)
     use_cpu = get_and_update_env(args, "use_cpu", [], list)
     compile = get_and_update_env(args, "compile", False, bool)
-    language = get_and_update_env(args, "language", False, bool)
     webui_config.experimental = get_and_update_env(
         args, "webui_experimental", False, bool
@@ -115,8 +114,6 @@ if __name__ == "__main__":
     webui_config.ssml_max = get_and_update_env(args, "ssml_max_len", 5000, int)
     webui_config.max_batch_size = get_and_update_env(args, "max_batch_size", 8, int)
-    config.runtime_env_vars.language = "zh-CN"
     webui_init()
     demo = create_interface()

     parser.add_argument(
         "--language",
         type=str,
         help="Set the default language for the webui",
     )
     args = parser.parse_args()
     device_id = get_and_update_env(args, "device_id", None, str)
     use_cpu = get_and_update_env(args, "use_cpu", [], list)
     compile = get_and_update_env(args, "compile", False, bool)
+    language = get_and_update_env(args, "language", "zh-CN", str)
     webui_config.experimental = get_and_update_env(
         args, "webui_experimental", False, bool
     webui_config.ssml_max = get_and_update_env(args, "ssml_max_len", 5000, int)
     webui_config.max_batch_size = get_and_update_env(args, "max_batch_size", 8, int)
     webui_init()
     demo = create_interface()