ChatTTS-Forge

Sleeping

App Files Files Community

zhzluke96 committed on Jun 6

Commit

02e90e4

•

1 Parent(s): d6fe286

update

Browse files

Files changed (26) hide show

modules/ChatTTS/ChatTTS/core.py +21 -5
modules/ChatTTS/ChatTTS/model/dvae.py +3 -3
modules/ChatTTS/ChatTTS/model/gpt.py +2 -3
modules/ChatTTS/ChatTTS/utils/gpu_utils.py +3 -1
modules/ChatTTS/ChatTTS/utils/infer_utils.py +5 -5
modules/SynthesizeSegments.py +2 -2
modules/api/Api.py +12 -1
modules/api/impl/google_api.py +16 -3
modules/api/impl/models_api.py +11 -0
modules/api/impl/openai_api.py +19 -7
modules/api/impl/ping_api.py +8 -0
modules/api/utils.py +6 -2
modules/config.py +2 -8
modules/devices/__init__.py +0 -0
modules/devices/devices.py +160 -0
modules/devices/mac_devices.py +42 -0
modules/generate_audio.py +33 -5
modules/models.py +24 -20
modules/normalization.py +47 -2
modules/refiner.py +1 -1
modules/speaker.py +11 -2
modules/synthesize_audio.py +2 -1
modules/utils/JsonObject.py +113 -0
modules/utils/cache.py +92 -0
modules/utils/zh_normalization/text_normlization.py +3 -3
webui.py +49 -22

modules/ChatTTS/ChatTTS/core.py CHANGED Viewed

@@ -101,13 +101,27 @@ class Chat:
         tokenizer_path: str = None,
         device: str = None,
         compile: bool = True,
     ):
         if not device:
             device = select_device(4096)
             self.logger.log(logging.INFO, f"use {device}")
         if vocos_config_path:
-            vocos = Vocos.from_hparams(vocos_config_path).to(device).eval()
             assert vocos_ckpt_path, "vocos_ckpt_path should not be None"
             vocos.load_state_dict(torch.load(vocos_ckpt_path))
             self.pretrain_models["vocos"] = vocos
@@ -115,7 +129,7 @@ class Chat:
         if dvae_config_path:
             cfg = OmegaConf.load(dvae_config_path)
-            dvae = DVAE(**cfg).to(device).eval()
             assert dvae_ckpt_path, "dvae_ckpt_path should not be None"
             dvae.load_state_dict(torch.load(dvae_ckpt_path, map_location=device))
             self.pretrain_models["dvae"] = dvae
@@ -123,7 +137,7 @@ class Chat:
         if gpt_config_path:
             cfg = OmegaConf.load(gpt_config_path)
-            gpt = GPT_warpper(**cfg).to(device).eval()
             assert gpt_ckpt_path, "gpt_ckpt_path should not be None"
             gpt.load_state_dict(torch.load(gpt_ckpt_path, map_location=device))
             if compile and "cuda" in str(device):
@@ -136,12 +150,14 @@ class Chat:
             assert os.path.exists(
                 spk_stat_path
             ), f"Missing spk_stat.pt: {spk_stat_path}"
-            self.pretrain_models["spk_stat"] = torch.load(spk_stat_path).to(device)
             self.logger.log(logging.INFO, "gpt loaded.")
         if decoder_config_path:
             cfg = OmegaConf.load(decoder_config_path)
-            decoder = DVAE(**cfg).to(device).eval()
             assert decoder_ckpt_path, "decoder_ckpt_path should not be None"
             decoder.load_state_dict(torch.load(decoder_ckpt_path, map_location=device))
             self.pretrain_models["decoder"] = decoder

         tokenizer_path: str = None,
         device: str = None,
         compile: bool = True,
+        dtype: torch.dtype = torch.float32,
+        dtype_vocos: torch.dtype = None,
+        dtype_dvae: torch.dtype = None,
+        dtype_gpt: torch.dtype = None,
+        dtype_decoder: torch.dtype = None,
     ):
         if not device:
             device = select_device(4096)
             self.logger.log(logging.INFO, f"use {device}")
+        dtype_vocos = dtype_vocos or dtype
+        dtype_dvae = dtype_dvae or dtype
+        dtype_gpt = dtype_gpt or dtype
+        dtype_decoder = dtype_decoder or dtype
         if vocos_config_path:
+            vocos = (
+                Vocos.from_hparams(vocos_config_path)
+                .to(device=device, dtype=dtype_vocos)
+                .eval()
+            )
             assert vocos_ckpt_path, "vocos_ckpt_path should not be None"
             vocos.load_state_dict(torch.load(vocos_ckpt_path))
             self.pretrain_models["vocos"] = vocos
         if dvae_config_path:
             cfg = OmegaConf.load(dvae_config_path)
+            dvae = DVAE(**cfg).to(device=device, dtype=dtype_dvae).eval()
             assert dvae_ckpt_path, "dvae_ckpt_path should not be None"
             dvae.load_state_dict(torch.load(dvae_ckpt_path, map_location=device))
             self.pretrain_models["dvae"] = dvae
         if gpt_config_path:
             cfg = OmegaConf.load(gpt_config_path)
+            gpt = GPT_warpper(**cfg).to(device=device, dtype=dtype_gpt).eval()
             assert gpt_ckpt_path, "gpt_ckpt_path should not be None"
             gpt.load_state_dict(torch.load(gpt_ckpt_path, map_location=device))
             if compile and "cuda" in str(device):
             assert os.path.exists(
                 spk_stat_path
             ), f"Missing spk_stat.pt: {spk_stat_path}"
+            self.pretrain_models["spk_stat"] = torch.load(spk_stat_path).to(
+                device=device, dtype=dtype
+            )
             self.logger.log(logging.INFO, "gpt loaded.")
         if decoder_config_path:
             cfg = OmegaConf.load(decoder_config_path)
+            decoder = DVAE(**cfg).to(device=device, dtype=dtype_decoder).eval()
             assert decoder_ckpt_path, "decoder_ckpt_path should not be None"
             decoder.load_state_dict(torch.load(decoder_ckpt_path, map_location=device))
             self.pretrain_models["decoder"] = decoder

modules/ChatTTS/ChatTTS/model/dvae.py CHANGED Viewed

@@ -143,9 +143,9 @@ class DVAE(nn.Module):
         else:
             vq_feats = inp.detach().clone()
-        temp = torch.chunk(vq_feats, 2, dim=1) # flatten trick :)
-        temp = torch.stack(temp, -1)
-        vq_feats = temp.reshape(*temp.shape[:2], -1)
         vq_feats = vq_feats.transpose(1, 2)
         dec_out = self.decoder(input=vq_feats)

         else:
             vq_feats = inp.detach().clone()
+        vq_feats = vq_feats.view(
+            (vq_feats.size(0), 2, vq_feats.size(1)//2, vq_feats.size(2)),
+        ).permute(0, 2, 3, 1).flatten(2)
         vq_feats = vq_feats.transpose(1, 2)
         dec_out = self.decoder(input=vq_feats)

modules/ChatTTS/ChatTTS/model/gpt.py CHANGED Viewed

@@ -190,6 +190,8 @@ class GPT_warpper(nn.Module):
                 attention_mask_cache[:, :attention_mask.shape[1]] = attention_mask
             for i in tqdm(range(max_new_token)):
                 model_input = self.prepare_inputs_for_generation(inputs_ids,
                     outputs.past_key_values if i!=0 else None,
@@ -250,9 +252,6 @@ class GPT_warpper(nn.Module):
                 end_idx = end_idx + (~finish).int()
-                if finish.all():
-                    break
             inputs_ids = [inputs_ids[idx, start_idx: start_idx+i] for idx, i in enumerate(end_idx.int())]
             inputs_ids = [i[:, 0] for i in inputs_ids] if infer_text else inputs_ids

                 attention_mask_cache[:, :attention_mask.shape[1]] = attention_mask
             for i in tqdm(range(max_new_token)):
+                if finish.all():
+                    continue
                 model_input = self.prepare_inputs_for_generation(inputs_ids,
                     outputs.past_key_values if i!=0 else None,
                 end_idx = end_idx + (~finish).int()
             inputs_ids = [inputs_ids[idx, start_idx: start_idx+i] for idx, i in enumerate(end_idx.int())]
             inputs_ids = [i[:, 0] for i in inputs_ids] if infer_text else inputs_ids

modules/ChatTTS/ChatTTS/utils/gpu_utils.py CHANGED Viewed

@@ -16,8 +16,10 @@ def select_device(min_memory = 2048):
         if free_memory_mb < min_memory:
             logger.log(logging.WARNING, f'GPU {selected_gpu} has {round(free_memory_mb, 2)} MB memory left.')
             device = torch.device('cpu')
     else:
         logger.log(logging.WARNING, f'No GPU found, use CPU instead')
         device = torch.device('cpu')
-    return device

         if free_memory_mb < min_memory:
             logger.log(logging.WARNING, f'GPU {selected_gpu} has {round(free_memory_mb, 2)} MB memory left.')
             device = torch.device('cpu')
+    elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
+        device = torch.device('mps')
     else:
         logger.log(logging.WARNING, f'No GPU found, use CPU instead')
         device = torch.device('cpu')
+    return device

modules/ChatTTS/ChatTTS/utils/infer_utils.py CHANGED Viewed

@@ -101,8 +101,8 @@ character_map = {
     "!": ".",
     "(": ",",
     ")": ",",
-    # '[': ',',
-    # ']': ',',
     ">": ",",
     "<": ",",
     "-": ",",
@@ -131,11 +131,11 @@ halfwidth_2_fullwidth_map = {
     ">": "＞",
     "?": "？",
     "@": "＠",
-    # '[': '［',
     "\\": "＼",
-    # ']': '］',
     "^": "＾",
-    # '_': '＿',
     "`": "｀",
     "{": "｛",
     "|": "｜",

     "!": ".",
     "(": ",",
     ")": ",",
+    "[": ",",
+    "]": ",",
     ">": ",",
     "<": ",",
     "-": ",",
     ">": "＞",
     "?": "？",
     "@": "＠",
+    "[": "［",
     "\\": "＼",
+    "]": "］",
     "^": "＾",
+    "_": "＿",
     "`": "｀",
     "{": "｛",
     "|": "｜",

modules/SynthesizeSegments.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import numpy as np
 from pydub import AudioSegment
-from typing import Any, List, Dict
 from scipy.io.wavfile import write
 import io
 from modules.utils.audio import time_stretch, pitch_shift
@@ -211,7 +211,7 @@ def generate_audio_segment(
     return AudioSegment.from_file(byte_io, format="wav")
-def synthesize_segment(segment: Dict[str, Any]) -> AudioSegment | None:
     if "break" in segment:
         pause_segment = AudioSegment.silent(duration=segment["break"])
         return pause_segment

 import numpy as np
 from pydub import AudioSegment
+from typing import Any, List, Dict, Union
 from scipy.io.wavfile import write
 import io
 from modules.utils.audio import time_stretch, pitch_shift
     return AudioSegment.from_file(byte_io, format="wav")
+def synthesize_segment(segment: Dict[str, Any]) -> Union[AudioSegment, None]:
     if "break" in segment:
         pause_segment = AudioSegment.silent(duration=segment["break"])
         return pause_segment

modules/api/Api.py CHANGED Viewed

@@ -27,7 +27,18 @@ class APIManager:
     def __init__(self, no_docs=False, exclude_patterns=[]):
         self.app = FastAPI(
             title="ChatTTS Forge API",
-            description="ChatTTS-Forge 是一个功能强大的文本转语音生成工具，支持通过类 SSML 语法生成丰富的音频长文本，并提供全面的 API 服务，适用于各种场景。\n\nChatTTS-Forge is a powerful text-to-speech generation tool that supports generating rich audio long texts through class SSML syntax\n\n https://github.com/lenML/ChatTTS-Forge",
             version="0.1.0",
             redoc_url=None if no_docs else "/redoc",
             docs_url=None if no_docs else "/docs",

     def __init__(self, no_docs=False, exclude_patterns=[]):
         self.app = FastAPI(
             title="ChatTTS Forge API",
+            description="""
+ChatTTS-Forge 是一个功能强大的文本转语音生成工具，支持通过类 SSML 语法生成丰富的音频长文本，并提供全面的 API 服务，适用于各种场景。<br/>
+ChatTTS-Forge is a powerful text-to-speech generation tool that supports generating rich audio long texts through class SSML syntax
+项目地址: [https://github.com/lenML/ChatTTS-Forge](https://github.com/lenML/ChatTTS-Forge)
+> 所有生成音频的 POST api都无法在此页面调试，调试建议使用 playground <br/>
+> All audio generation POST APIs cannot be debugged on this page, it is recommended to use playground for debugging
+> 如果你不熟悉本系统，建议从这个一键脚本开始，在colab中尝试一下：<br/>
+> [https://colab.research.google.com/github/lenML/ChatTTS-Forge/blob/main/colab.ipynb](https://colab.research.google.com/github/lenML/ChatTTS-Forge/blob/main/colab.ipynb)
+            """,
             version="0.1.0",
             redoc_url=None if no_docs else "/redoc",
             docs_url=None if no_docs else "/docs",

modules/api/impl/google_api.py CHANGED Viewed

@@ -30,6 +30,7 @@ class SynthesisInput(BaseModel):
 class VoiceSelectionParams(BaseModel):
     languageCode: str = "ZH-CN"
     name: str = "female2"
     style: str = ""
     temperature: float = 0.3
@@ -160,6 +161,18 @@ async def google_text_synthesize(request: GoogleTextSynthesizeRequest):
 def setup(app: APIManager):
-    app.post("/v1/google/text:synthesize", response_model=GoogleTextSynthesizeResponse)(
-        google_text_synthesize
-    )

 class VoiceSelectionParams(BaseModel):
     languageCode: str = "ZH-CN"
     name: str = "female2"
     style: str = ""
     temperature: float = 0.3
 def setup(app: APIManager):
+    app.post(
+        "/v1/text:synthesize",
+        response_model=GoogleTextSynthesizeResponse,
+        description="""
+google api document: <br/>
+[https://cloud.google.com/text-to-speech/docs/reference/rest/v1/text/synthesize](https://cloud.google.com/text-to-speech/docs/reference/rest/v1/text/synthesize)
+- 多个属性在本系统中无用仅仅是为了兼容google api
+- voice 中的 topP, topK, temperature 为本系统中的参数
+- voice.name 即 speaker name （或者speaker seed）
+- voice.seed 为 infer seed （可在webui中测试具体作用）
+- 编码格式影响的是 audioContent 的二进制格式，所以所有format都是返回带有base64数据的json
+        """,
+    )(google_text_synthesize)

modules/api/impl/models_api.py ADDED Viewed

	@@ -0,0 +1,11 @@

+from modules.api import utils as api_utils
+from modules.api.Api import APIManager
+from modules.models import reload_chat_tts
+def setup(app: APIManager):
+    """Register the ``/v1/models/reload`` endpoint on ``app``."""
+    @app.get("/v1/models/reload", response_model=api_utils.BaseResponse)
+    async def reload_models():
+        # Discards the loaded ChatTTS instance and loads it again.
+        # NOTE(review): reload_chat_tts() is a synchronous call made from an
+        # async handler, and this state-changing route is exposed via GET -
+        # confirm both are intended.
+        reload_chat_tts()
+        return api_utils.success_response("Models reloaded")

modules/api/impl/openai_api.py CHANGED Viewed

@@ -28,11 +28,11 @@ class AudioSpeechRequest(BaseModel):
     model: str = "chattts-4w"
     voice: str = "female2"
     response_format: Literal["mp3", "wav"] = "mp3"
-    speed: int = Field(1, ge=1, le=10, description="Speed of the audio")
     style: str = ""
     # 是否开启batch合成，小于等于1表示不适用batch
     # 开启batch合成会自动分割句子
-    batch_size: int = Field(1, ge=1, le=10, description="Batch size")
     spliter_threshold: float = Field(
         100, ge=10, le=1024, description="Threshold for sentence spliter"
     )
@@ -64,8 +64,8 @@ async def openai_speech_api(
         params = api_utils.calc_spk_style(spk=voice, style=style)
         spk = params.get("spk", -1)
-        seed = params.get("seed", 42)
-        temperature = params.get("temperature", 0.3)
         prompt1 = params.get("prompt1", "")
         prompt2 = params.get("prompt2", "")
         prefix = params.get("prefix", "")
@@ -107,6 +107,18 @@ async def openai_speech_api(
 def setup(api_manager: APIManager):
-    api_manager.post("/v1/openai/audio/speech", response_class=FileResponse)(
-        openai_speech_api
-    )

     model: str = "chattts-4w"
     voice: str = "female2"
     response_format: Literal["mp3", "wav"] = "mp3"
+    speed: float = Field(1, ge=0.1, le=10, description="Speed of the audio")
     style: str = ""
     # 是否开启batch合成，小于等于1表示不适用batch
     # 开启batch合成会自动分割句子
+    batch_size: int = Field(1, ge=1, le=20, description="Batch size")
     spliter_threshold: float = Field(
         100, ge=10, le=1024, description="Threshold for sentence spliter"
     )
         params = api_utils.calc_spk_style(spk=voice, style=style)
         spk = params.get("spk", -1)
+        seed = params.get("seed", request.seed or 42)
+        temperature = params.get("temperature", request.temperature or 0.3)
         prompt1 = params.get("prompt1", "")
         prompt2 = params.get("prompt2", "")
         prefix = params.get("prefix", "")
 def setup(api_manager: APIManager):
+    api_manager.post(
+        "/v1/audio/speech",
+        response_class=FileResponse,
+        description="""
+openai api document:
+[https://platform.openai.com/docs/guides/text-to-speech](https://platform.openai.com/docs/guides/text-to-speech)
+以下属性为本系统自定义属性，不在openai文档中：
+- batch_size: 是否开启batch合成，小于等于1表示不使用batch （不推荐）
+- spliter_threshold: 开启batch合成时，句子分割的阈值
+- style: 风格
+> model 可填任意值
+        """,
+    )(openai_speech_api)

modules/api/impl/ping_api.py ADDED Viewed

	@@ -0,0 +1,8 @@

+from modules.api import utils as api_utils
+from modules.api.Api import APIManager
+def setup(app: APIManager):
+    """Register the ``/v1/ping`` endpoint on ``app``."""
+    @app.get("/v1/ping", response_model=api_utils.BaseResponse)
+    async def ping():
+        # Fixed payload; the route declares BaseResponse as its response
+        # model, so the dict has to carry that model's fields.
+        return {"message": "ok", "data": "pong"}

modules/api/utils.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from pydantic import BaseModel
-from typing import Any
 import torch
@@ -36,6 +36,10 @@ class BaseResponse(BaseModel):
         }
 def wav_to_mp3(wav_data, bitrate="48k"):
     audio = AudioSegment.from_wav(
         wav_data,
@@ -51,7 +55,7 @@ def to_number(value, t, default=0):
         return default
-def calc_spk_style(spk: str | int, style: str | int):
     voice_attrs = {
         "spk": None,
         "seed": None,

 from pydantic import BaseModel
+from typing import Any, Union
 import torch
         }
+def success_response(data: Any, message: str = "Success") -> BaseResponse:
+    """Wrap ``data`` in a ``BaseResponse`` that carries ``message``.
+
+    Args:
+        data: Payload stored in the response's ``data`` field.
+        message: Text stored in the ``message`` field; defaults to "Success".
+
+    Returns:
+        The newly built ``BaseResponse``.
+    """
+    fields = {"message": message, "data": data}
+    return BaseResponse(**fields)
 def wav_to_mp3(wav_data, bitrate="48k"):
     audio = AudioSegment.from_wav(
         wav_data,
         return default
+def calc_spk_style(spk: Union[str, int], style: Union[str, int]):
     voice_attrs = {
         "spk": None,
         "seed": None,

modules/config.py CHANGED Viewed

@@ -1,11 +1,5 @@
-enable_model_compile = False
-lru_size = 64
-args = {}
 api = None
-model_config = {"half": False}
-disable_tqdm = False

+from modules.utils.JsonObject import JsonObject
+runtime_env_vars = JsonObject({})
 api = None

modules/devices/__init__.py ADDED Viewed

File without changes

modules/devices/devices.py ADDED Viewed

	@@ -0,0 +1,160 @@

+from functools import lru_cache
+import sys
+import torch
+from modules import config
+import logging
+logger = logging.getLogger(__name__)
+if sys.platform == "darwin":
+    from modules.devices import mac_devices
+def has_mps() -> bool:
+    """Report whether the MPS backend was detected on this machine.
+
+    ``mac_devices`` is only imported on macOS, so it is consulted only there;
+    every other platform short-circuits to ``False``.
+    """
+    return sys.platform == "darwin" and mac_devices.has_mps
+def get_cuda_device_id():
+    """Return the CUDA device index selected by the runtime configuration.
+
+    A configured ``device_id`` is used only when it is a string of digits
+    that parses to a non-zero integer. A missing, non-numeric or "0" value
+    falls back to ``torch.cuda.current_device()``.
+    """
+    configured = config.runtime_env_vars.device_id
+    if configured is not None and configured.isdigit():
+        parsed = int(configured)
+        if parsed:
+            return parsed
+    return torch.cuda.current_device()
+def get_cuda_device_string():
+    """Return the torch device string for the configured CUDA device.
+
+    Yields ``"cuda:<device_id>"`` when a ``device_id`` is configured and the
+    bare ``"cuda"`` string otherwise.
+    """
+    device_id = config.runtime_env_vars.device_id
+    return "cuda" if device_id is None else f"cuda:{device_id}"
+def get_available_gpus() -> list[tuple[int, int]]:
+    """
+    List every visible CUDA device together with an estimate of its free memory.
+    The estimate is the device's total memory minus what
+    ``torch.cuda.memory_reserved`` reports for it.
+    :return: A list of ``(GPU index, free memory in bytes)`` tuples; empty when
+        no CUDA device is visible.
+    """
+    def free_bytes(index: int) -> int:
+        total = torch.cuda.get_device_properties(index).total_memory
+        return total - torch.cuda.memory_reserved(index)
+    return [(index, free_bytes(index)) for index in range(torch.cuda.device_count())]
+def get_memory_available_gpus(min_memory=2048):
+    """Return the indices of the GPUs that have more than ``min_memory`` MB free.
+
+    Args:
+        min_memory: Required free memory in megabytes (the same unit and
+            default that ``select_device`` uses for its threshold).
+
+    Returns:
+        A list of GPU indices, in device order, that satisfy the threshold.
+    """
+    # get_available_gpus() reports bytes, so convert the MB threshold first;
+    # comparing the raw numbers would accept any GPU with more than ~2 KB free.
+    min_memory_bytes = min_memory * 1024 * 1024
+    return [
+        gpu
+        for gpu, free_memory in get_available_gpus()
+        if free_memory > min_memory_bytes
+    ]
+def get_target_device_id_or_memory_available_gpu():
+    """Pick the CUDA device string to use, falling back when needed.
+
+    The requested device is kept if it is among the GPUs with enough free
+    memory. Otherwise the first such GPU is written back to
+    ``config.runtime_env_vars.device_id`` and used instead, and if there is
+    none at all the string ``"cpu"`` is returned.
+    """
+    candidates = get_memory_available_gpus()
+    requested = get_cuda_device_id()
+    if requested in candidates:
+        return get_cuda_device_string()
+    if not candidates:
+        logger.warning(
+            f"Device {requested} is not available or does not have enough memory. Using CPU instead."
+        )
+        return "cpu"
+    logger.warning(
+        f"Device {requested} is not available or does not have enough memory. will try to use {candidates}"
+    )
+    # Persist the substitute so later get_cuda_device_string() calls agree.
+    config.runtime_env_vars.device_id = str(candidates[0])
+    return get_cuda_device_string()
+def get_optimal_device_name():
+    """Return the name of the device to run on.
+
+    Priority: an explicit ``use_cpu`` setting, then CUDA (subject to the
+    free-memory check), then MPS, and finally CPU.
+    """
+    if config.runtime_env_vars.use_cpu:
+        return "cpu"
+    if torch.cuda.is_available():
+        return get_target_device_id_or_memory_available_gpu()
+    return "mps" if has_mps() else "cpu"
+def get_optimal_device():
+    """Return the preferred device as a ``torch.device`` object."""
+    device_name = get_optimal_device_name()
+    return torch.device(device_name)
+def get_device_for(task):
+    """Return the device that ``task`` should run on.
+
+    Args:
+        task: Task name looked up in ``config.runtime_env_vars.use_cpu``.
+
+    Returns:
+        The module-level ``cpu`` device when the task (or "all") is configured
+        to run on CPU, otherwise the result of ``get_optimal_device()``.
+    """
+    # The config module exposes ``runtime_env_vars``, not ``cmd_opts``;
+    # reading ``config.cmd_opts`` raised AttributeError on every call.
+    use_cpu = config.runtime_env_vars.use_cpu
+    if not use_cpu:
+        return get_optimal_device()
+    # NOTE(review): use_cpu is assumed to be either a plain flag or a
+    # collection of task names - confirm against the CLI definition.
+    if use_cpu is True or task in use_cpu or "all" in use_cpu:
+        return cpu
+    return get_optimal_device()
+def torch_gc():
+    """Best-effort release of cached accelerator memory (CUDA and/or MPS).
+
+    Failures are logged with a traceback and swallowed, so a cleanup problem
+    never propagates to the caller.
+    """
+    try:
+        if torch.cuda.is_available():
+            with torch.cuda.device(get_cuda_device_string()):
+                torch.cuda.empty_cache()
+                torch.cuda.ipc_collect()
+        if has_mps():
+            mac_devices.torch_mps_gc()
+    except Exception:
+        # logger.exception logs at ERROR level with the active traceback.
+        logger.exception("Error in torch_gc")
+# CPU device handle; get_device_for() returns it for CPU-pinned tasks.
+cpu: torch.device = torch.device("cpu")
+# Resolved once at import time; reset_device() recomputes it when a device_id
+# or use_cpu setting is present in the runtime configuration.
+device: torch.device = get_optimal_device()
+# Default and per-model dtypes. All start as float32 and are switched to
+# float16 by reset_device() when the "half" runtime option is set.
+dtype: torch.dtype = torch.float32
+dtype_dvae: torch.dtype = torch.float32
+dtype_vocos: torch.dtype = torch.float32
+dtype_gpt: torch.dtype = torch.float32
+dtype_decoder: torch.dtype = torch.float32
+def reset_device():
+    """Re-derive the module-level dtype and device globals from the runtime config.
+
+    With ``half`` set, every dtype global becomes ``torch.float16`` (nothing
+    here switches them back to float32). With ``device_id`` or ``use_cpu``
+    present, ``device`` is recomputed via ``get_optimal_device()``.
+    """
+    global device, dtype, dtype_dvae, dtype_vocos, dtype_gpt, dtype_decoder
+    env = config.runtime_env_vars
+    if env.half:
+        dtype = dtype_dvae = dtype_vocos = dtype_gpt = dtype_decoder = torch.float16
+        logger.info("Using half precision: torch.float16")
+    if env.device_id is not None or env.use_cpu is not None:
+        device = get_optimal_device()
+        logger.info(f"Using device: {device}")
+@lru_cache
+def first_time_calculation():
+    """
+    Warm up PyTorch by pushing a dummy tensor through one tiny Linear layer and
+    one tiny Conv2d layer on the module-level ``device``/``dtype``.
+    Per the original author's note, the first such computation allocates about
+    700MB of memory and takes about 2.7 seconds, at least with NVIDIA.
+    ``lru_cache`` makes every call after the first a no-op.
+    """
+    warmup_vector = torch.zeros((1, 1)).to(device, dtype)
+    dense = torch.nn.Linear(1, 1).to(device, dtype)
+    dense(warmup_vector)
+    warmup_image = torch.zeros((1, 1, 3, 3)).to(device, dtype)
+    conv = torch.nn.Conv2d(1, 1, (3, 3)).to(device, dtype)
+    conv(warmup_image)

modules/devices/mac_devices.py ADDED Viewed

	@@ -0,0 +1,42 @@

+import torch
+import logging
+from packaging import version
+import torch.backends
+import torch.backends.mps
+logger = logging.getLogger(__name__)
+def check_for_mps() -> bool:
+    """Detect whether the MPS backend can be used with the installed torch.
+
+    For torch <= 2.0.1 the ``torch.has_mps`` flag is checked and then a tiny
+    tensor is moved to the "mps" device as a probe. Newer versions rely on
+    ``torch.backends.mps.is_available()`` and ``is_built()``.
+
+    Returns:
+        ``True`` when MPS looks usable, ``False`` otherwise (including when
+        the check itself fails).
+    """
+    if version.parse(torch.__version__) <= version.parse("2.0.1"):
+        if not getattr(torch, "has_mps", False):
+            return False
+        try:
+            torch.zeros(1).to(torch.device("mps"))
+            return True
+        except Exception:
+            return False
+    try:
+        return torch.backends.mps.is_available() and torch.backends.mps.is_built()
+    except Exception:
+        # Catch Exception rather than everything so KeyboardInterrupt and
+        # SystemExit still propagate, and name the operation that failed.
+        logger.warning("MPS availability check failed", exc_info=True)
+        return False
+# Evaluated once when this module is imported; callers read the cached flag.
+has_mps = check_for_mps()
+def torch_mps_gc() -> None:
+    """Call ``torch.mps.empty_cache()``, logging a warning instead of raising."""
+    try:
+        # Imported inside the try so that a torch build without ``torch.mps``
+        # only produces the warning below.
+        from torch.mps import empty_cache
+        empty_cache()
+    except Exception:
+        logger.warning("MPS garbage collection failed", exc_info=True)
+# Manual smoke test: print the torch version and the detected MPS flag, then
+# run one MPS cache cleanup.
+if __name__ == "__main__":
+    print(torch.__version__)
+    print(has_mps)
+    torch_mps_gc()

modules/generate_audio.py CHANGED Viewed

@@ -8,18 +8,20 @@ from modules import models, config
 import logging
-from modules import devices
 logger = logging.getLogger(__name__)
-@torch.inference_mode()
 def generate_audio(
     text: str,
     temperature: float = 0.3,
     top_P: float = 0.7,
     top_K: float = 20,
-    spk: int | Speaker = -1,
     infer_seed: int = -1,
     use_decoder: bool = True,
     prompt1: str = "",
@@ -48,7 +50,7 @@ def generate_audio_batch(
     temperature: float = 0.3,
     top_P: float = 0.7,
     top_K: float = 20,
-    spk: int | Speaker = -1,
     infer_seed: int = -1,
     use_decoder: bool = True,
     prompt1: str = "",
@@ -65,7 +67,7 @@ def generate_audio_batch(
         "prompt2": prompt2 or "",
         "prefix": prefix or "",
         "repetition_penalty": 1.0,
-        "disable_tqdm": config.disable_tqdm,
     }
     if isinstance(spk, int):
@@ -103,6 +105,32 @@ def generate_audio_batch(
     return [(sample_rate, np.array(wav).flatten().astype(np.float32)) for wav in wavs]
 if __name__ == "__main__":
     import soundfile as sf

 import logging
+from modules.devices import devices
+from typing import Union
+from modules.utils.cache import conditional_cache
 logger = logging.getLogger(__name__)
 def generate_audio(
     text: str,
     temperature: float = 0.3,
     top_P: float = 0.7,
     top_K: float = 20,
+    spk: Union[int, Speaker] = -1,
     infer_seed: int = -1,
     use_decoder: bool = True,
     prompt1: str = "",
     temperature: float = 0.3,
     top_P: float = 0.7,
     top_K: float = 20,
+    spk: Union[int, Speaker] = -1,
     infer_seed: int = -1,
     use_decoder: bool = True,
     prompt1: str = "",
         "prompt2": prompt2 or "",
         "prefix": prefix or "",
         "repetition_penalty": 1.0,
+        "disable_tqdm": config.runtime_env_vars.off_tqdm,
     }
     if isinstance(spk, int):
     return [(sample_rate, np.array(wav).flatten().astype(np.float32)) for wav in wavs]
+lru_cache_enabled = False
+def setup_lru_cache():
+    """Wrap ``generate_audio_batch`` in a conditional cache, at most once.
+
+    The guard flag is set on the first call even if the configured size turns
+    out to be invalid, so later calls never retry. The cache size comes from
+    ``config.runtime_env_vars.lru_size`` and must be an ``int``.
+    """
+    global generate_audio_batch, lru_cache_enabled
+    if lru_cache_enabled:
+        return
+    lru_cache_enabled = True
+    def should_cache(*args, **kwargs):
+        # Only keyword arguments are inspected: a call is cacheable when both
+        # ``spk`` and ``infer_seed`` are passed by name and neither is -1.
+        return kwargs.get("spk", -1) != -1 and kwargs.get("infer_seed", -1) != -1
+    lru_size = config.runtime_env_vars.lru_size
+    if not isinstance(lru_size, int):
+        logger.debug(f"LRU cache failed to enable, invalid size {lru_size}")
+        return
+    cached = conditional_cache(lru_size, should_cache)(generate_audio_batch)
+    generate_audio_batch = cached
+    logger.info(f"LRU cache enabled with size {lru_size}")
 if __name__ == "__main__":
     import soundfile as sf

modules/models.py CHANGED Viewed

@@ -1,15 +1,11 @@
-from modules.ChatTTS import ChatTTS
 import torch
 from modules import config
 import logging
 logger = logging.getLogger(__name__)
-device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
-print(f"device use {device}")
 chat_tts = None
@@ -17,25 +13,33 @@ def load_chat_tts():
     global chat_tts
     if chat_tts:
         return chat_tts
     chat_tts = ChatTTS.Chat()
     chat_tts.load_models(
-        compile=config.enable_model_compile,
         source="local",
         local_path="./models/ChatTTS",
-        device=device,
     )
-    if config.model_config.get("half", False):
-        logging.info("half precision enabled")
-        for model_name, model in chat_tts.pretrain_models.items():
-            if isinstance(model, torch.nn.Module):
-                model.cpu()
-                if torch.cuda.is_available():
-                    torch.cuda.empty_cache()
-                model.half()
-                if torch.cuda.is_available():
-                    model.cuda()
-                model.eval()
-                logger.log(logging.INFO, f"{model_name} converted to half precision.")
     return chat_tts

 import torch
+from modules.ChatTTS import ChatTTS
 from modules import config
+from modules.devices import devices
 import logging
 logger = logging.getLogger(__name__)
 chat_tts = None
     global chat_tts
     if chat_tts:
         return chat_tts
     chat_tts = ChatTTS.Chat()
     chat_tts.load_models(
+        compile=config.runtime_env_vars.compile,
         source="local",
         local_path="./models/ChatTTS",
+        device=devices.device,
+        dtype=devices.dtype,
+        dtype_vocos=devices.dtype_vocos,
+        dtype_dvae=devices.dtype_dvae,
+        dtype_gpt=devices.dtype_gpt,
+        dtype_decoder=devices.dtype_decoder,
     )
+    devices.torch_gc()
     return chat_tts
+def reload_chat_tts():
+    """Discard the loaded ChatTTS instance and load a fresh one.
+
+    When a model is loaded and CUDA is available, every ``torch.nn.Module`` in
+    ``pretrain_models`` is first moved to the CPU and the CUDA cache is
+    emptied, so the reload does not sit on top of the old allocation.
+
+    Returns:
+        The newly loaded instance returned by ``load_chat_tts()``.
+    """
+    global chat_tts
+    # Use the module logger (not the root ``logging`` functions) so the
+    # message follows this module's logging configuration.
+    logger.info("Reloading ChatTTS models")
+    if chat_tts and torch.cuda.is_available():
+        for model in chat_tts.pretrain_models.values():
+            if isinstance(model, torch.nn.Module):
+                model.cpu()
+        torch.cuda.empty_cache()
+    chat_tts = None
+    return load_chat_tts()

modules/normalization.py CHANGED Viewed

@@ -1,6 +1,15 @@
 from modules.utils.zh_normalization.text_normlization import *
 import emojiswitch
 from modules.utils.markdown import markdown_to_text
 post_normalize_pipeline = []
 pre_normalize_pipeline = []
@@ -87,12 +96,17 @@ character_map = {
     ">": ",",
     "<": ",",
     "-": ",",
 }
 character_to_word = {
     " & ": " and ",
 }
 @post_normalize()
 def apply_character_to_word(text):
@@ -109,7 +123,8 @@ def apply_character_map(text):
 @post_normalize()
 def apply_emoji_map(text):
-    return emojiswitch.demojize(text, delimiters=("", ""), lang="zh")
 @post_normalize()
@@ -122,6 +137,26 @@ def insert_spaces_between_uppercase(s):
     )
 @pre_normalize()
 def apply_markdown_to_text(text):
     if is_markdown(text):
@@ -186,7 +221,7 @@ def sentence_normalize(sentence_text: str):
     pattern = re.compile(r"(\[.+?\])|([^[]+)")
     def normalize_part(part):
-        sentences = tx.normalize(part)
         dest_text = ""
         for sentence in sentences:
             sentence = apply_post_normalize(sentence)
@@ -244,6 +279,16 @@ console.log('1')
 “我们是玫瑰花。”花儿们说道。
 “啊！”小王子说……。
         """,
     ]
     for i, test_case in enumerate(test_cases):

 from modules.utils.zh_normalization.text_normlization import *
 import emojiswitch
 from modules.utils.markdown import markdown_to_text
+from modules import models
+import re
+def is_chinese(text):
+    """Return ``True`` if ``text`` contains at least one Chinese character.
+
+    Chinese characters are taken to be the Unicode range U+4E00 to U+9FFF.
+    """
+    return re.search(r"[\u4e00-\u9fff]", text) is not None
 post_normalize_pipeline = []
 pre_normalize_pipeline = []
     ">": ",",
     "<": ",",
     "-": ",",
+    "~": " ",
+    "～": " ",
+    "/": " ",
 }
 character_to_word = {
     " & ": " and ",
 }
+## ---------- post normalize ----------
 @post_normalize()
 def apply_character_to_word(text):
 @post_normalize()
 def apply_emoji_map(text):
+    """Replace emoji with their textual names, in Chinese or English.
+
+    Chinese names are used when ``text`` contains Chinese characters and
+    English names otherwise.
+    """
+    return emojiswitch.demojize(
+        text,
+        delimiters=("", ""),
+        lang="zh" if is_chinese(text) else "en",
+    )
 @post_normalize()
     )
+@post_normalize()
+def replace_unk_tokens(text):
+    """
+    Replace every character that is not in the tokenizer vocabulary with " , ".
+    ASCII letters and the whitespace characters space, newline, carriage
+    return and tab are always kept.
+    """
+    tokenizer = models.load_chat_tts().pretrain_models["tokenizer"]
+    # Always allow English letters and basic whitespace on top of the
+    # vocabulary entries.
+    allowed = (
+        set(tokenizer.get_vocab().keys())
+        | set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
+        | set(" \n\r\t")
+    )
+    return "".join(char if char in allowed else " , " for char in text)
+## ---------- pre normalize ----------
 @pre_normalize()
 def apply_markdown_to_text(text):
     if is_markdown(text):
     pattern = re.compile(r"(\[.+?\])|([^[]+)")
     def normalize_part(part):
+        sentences = tx.normalize(part) if is_chinese(part) else [part]
         dest_text = ""
         for sentence in sentences:
             sentence = apply_post_normalize(sentence)
 “我们是玫瑰花。”花儿们说道。
 “啊！”小王子说……。
         """,
+        """
+State-of-the-art Machine Learning for PyTorch, TensorFlow, and JAX.
+🤗 Transformers provides APIs and tools to easily download and train state-of-the-art pretrained models. Using pretrained models can reduce your compute costs, carbon footprint, and save you the time and resources required to train a model from scratch. These models support common tasks in different modalities, such as:
+📝 Natural Language Processing: text classification, named entity recognition, question answering, language modeling, summarization, translation, multiple choice, and text generation.
+🖼️ Computer Vision: image classification, object detection, and segmentation.
+🗣️ Audio: automatic speech recognition and audio classification.
+🐙 Multimodal: table question answering, optical character recognition, information extraction from scanned documents, video classification, and visual question answering.
+        """,
     ]
     for i, test_case in enumerate(test_cases):

modules/refiner.py CHANGED Viewed

@@ -29,7 +29,7 @@ def refine_text(
                 "temperature": temperature,
                 "repetition_penalty": repetition_penalty,
                 "max_new_token": max_new_token,
-                "disable_tqdm": config.disable_tqdm,
             },
             do_text_normalization=False,
         )

                 "temperature": temperature,
                 "repetition_penalty": repetition_penalty,
                 "max_new_token": max_new_token,
+                "disable_tqdm": config.runtime_env_vars.off_tqdm,
             },
             do_text_normalization=False,
         )

modules/speaker.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import os
 import torch
 from modules import models
@@ -53,6 +54,14 @@ class Speaker:
         return is_update
 # 每个speaker就是一个 emb 文件 .pt
 # 管理 speaker 就是管理 ./data/speaker/ 下的所有 speaker
@@ -105,13 +114,13 @@ class SpeakerManager:
         self.refresh_speakers()
         return speaker
-    def get_speaker(self, name) -> Speaker | None:
         for speaker in self.speakers.values():
             if speaker.name == name:
                 return speaker
         return None
-    def get_speaker_by_id(self, id) -> Speaker | None:
         for speaker in self.speakers.values():
             if str(speaker.id) == str(id):
                 return speaker

 import os
+from typing import Union
 import torch
 from modules import models
         return is_update
+    def __hash__(self):
+        return hash(str(self.id))
+    def __eq__(self, other):
+        if not isinstance(other, Speaker):
+            return False
+        return str(self.id) == str(other.id)
 # 每个speaker就是一个 emb 文件 .pt
 # 管理 speaker 就是管理 ./data/speaker/ 下的所有 speaker
         self.refresh_speakers()
         return speaker
+    def get_speaker(self, name) -> Union[Speaker, None]:
         for speaker in self.speakers.values():
             if speaker.name == name:
                 return speaker
         return None
+    def get_speaker_by_id(self, id) -> Union[Speaker, None]:
         for speaker in self.speakers.values():
             if str(speaker.id) == str(id):
                 return speaker

modules/synthesize_audio.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import io
 from modules.SentenceSplitter import SentenceSplitter
 from modules.SynthesizeSegments import SynthesizeSegments, combine_audio_segments
@@ -14,7 +15,7 @@ def synthesize_audio(
     temperature: float = 0.3,
     top_P: float = 0.7,
     top_K: float = 20,
-    spk: int | Speaker = -1,
     infer_seed: int = -1,
     use_decoder: bool = True,
     prompt1: str = "",

 import io
+from typing import Union
 from modules.SentenceSplitter import SentenceSplitter
 from modules.SynthesizeSegments import SynthesizeSegments, combine_audio_segments
     temperature: float = 0.3,
     top_P: float = 0.7,
     top_K: float = 20,
+    spk: Union[int, Speaker] = -1,
     infer_seed: int = -1,
     use_decoder: bool = True,
     prompt1: str = "",

modules/utils/JsonObject.py ADDED Viewed

	@@ -0,0 +1,113 @@

class JsonObject:
    """
    Thin wrapper that exposes a dictionary's keys as attributes and as items.

    Reading a missing key (by attribute or by item) yields ``None`` instead of
    raising, and deleting a missing attribute is a silent no-op. The wrapped
    dictionary is stored by reference, not copied.
    """

    def __init__(self, initial_dict=None):
        """
        Initialize the JsonObject with an optional initial dictionary.

        :param initial_dict: A dictionary to wrap. It is kept by reference, so
            writes made through this object are visible in it. When ``None``,
            a new empty dictionary is used.
        """
        self._dict_obj = initial_dict if initial_dict is not None else {}

    def __getattr__(self, name):
        """
        Fallback attribute lookup backed by the internal dictionary.

        :param name: The name of the attribute.
        :return: The stored value, or ``None`` if the key is absent.
        :raises AttributeError: For ``_dict_obj`` itself and for dunder names.
        """
        # __getattr__ only runs after normal lookup failed. If that happens for
        # "_dict_obj" the instance was never initialised (copy/pickle create
        # instances through __new__), and reading self._dict_obj below would
        # recurse forever. Dunder names are protocol probes (__deepcopy__,
        # __getstate__, __setstate__, ...) and must look absent, not be None.
        if name == "_dict_obj" or (name.startswith("__") and name.endswith("__")):
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            )
        try:
            return self._dict_obj[name]
        except KeyError:
            return None

    def __setattr__(self, name, value):
        """
        Set an attribute value. If the attribute name is '_dict_obj',
        set it directly as an instance attribute. Otherwise,
        store it in the internal dictionary.

        :param name: The name of the attribute.
        :param value: The value to set.
        """
        if name == "_dict_obj":
            super().__setattr__(name, value)
        else:
            self._dict_obj[name] = value

    def __delattr__(self, name):
        """
        Remove ``name`` from the internal dictionary.

        Deleting a key that is not present is silently ignored.

        :param name: The name of the attribute.
        """
        try:
            del self._dict_obj[name]
        except KeyError:
            return

    def __getitem__(self, key):
        """
        Get an item value from the internal dictionary.

        :param key: The key of the item.
        :return: The stored value, or ``None`` if the key is absent.
        """
        if key not in self._dict_obj:
            return None
        return self._dict_obj[key]

    def __setitem__(self, key, value):
        """
        Set an item value in the internal dictionary.

        :param key: The key of the item.
        :param value: The value to set.
        """
        self._dict_obj[key] = value

    def __delitem__(self, key):
        """
        Delete an item from the internal dictionary.

        :param key: The key of the item.
        :raises KeyError: If the key is not found in the dictionary.
        """
        del self._dict_obj[key]

    def __contains__(self, key):
        """
        Support ``key in obj``.

        Without this method Python falls back to calling ``__getitem__`` with
        0, 1, 2, ... until ``IndexError``; since ``__getitem__`` never raises,
        that fallback would never terminate.

        :param key: The key to check.
        :return: True if the key exists, False otherwise.
        """
        return key in self._dict_obj

    def __iter__(self):
        """
        Iterate over the keys of the internal dictionary.

        Defined for the same reason as ``__contains__``: the implicit
        ``__getitem__``-based iteration would loop forever.

        :return: An iterator over the keys.
        """
        return iter(self._dict_obj)

    def __repr__(self):
        """
        :return: A debugging representation showing the wrapped dictionary.
        """
        return f"{type(self).__name__}({self._dict_obj!r})"

    def to_dict(self):
        """
        Convert the JsonObject back to a regular dictionary.

        :return: The internal dictionary itself (not a copy).
        """
        return self._dict_obj

    def has_key(self, key):
        """
        Check if the key exists in the internal dictionary.

        :param key: The key to check.
        :return: True if the key exists, False otherwise.
        """
        return key in self._dict_obj

    def keys(self):
        """
        Get the keys of the internal dictionary.

        :return: The dictionary's keys view.
        """
        return self._dict_obj.keys()

    def values(self):
        """
        Get the values of the internal dictionary.

        :return: The dictionary's values view.
        """
        return self._dict_obj.values()

    def items(self):
        """
        Get the key/value pairs of the internal dictionary.

        :return: The dictionary's items view.
        """
        return self._dict_obj.items()

modules/utils/cache.py ADDED Viewed

	@@ -0,0 +1,92 @@

+from typing import Callable, TypeVar, Any
+from typing_extensions import ParamSpec
+from functools import lru_cache, _CacheInfo
def conditional_cache(maxsize: int, condition: Callable):
    """
    Build a decorator that caches a function's results only when asked to.

    On every call ``condition(*args, **kwargs)`` is evaluated with the call's
    own arguments. When it is truthy the call goes through an
    ``lru_cache_ext``-backed copy of the function; otherwise the function is
    invoked directly and the cache is neither read nor written.

    :param maxsize: Forwarded to ``lru_cache_ext`` as ``maxsize``.
    :param condition: Callable receiving the same arguments as the decorated
        function; its truthiness selects the cached path.
    :return: A decorator producing the wrapped function.
    """
    # Imported locally so this helper does not depend on the module's import block.
    from functools import wraps

    def decorator(func):
        @lru_cache_ext(maxsize=maxsize)
        def cached_func(*args, **kwargs):
            return func(*args, **kwargs)

        # Keep the wrapped function's name, docstring and __wrapped__ visible.
        @wraps(func)
        def wrapper(*args, **kwargs):
            if condition(*args, **kwargs):
                return cached_func(*args, **kwargs)
            return func(*args, **kwargs)

        return wrapper

    return decorator
def hash_list(l: list) -> int:
    """
    Fold the elements of ``l`` into one order-sensitive integer digest.

    Each step hashes the running digest together with the element's position
    and its ``hash_item`` value. An empty list yields ``0``.
    """
    digest = 0
    for position, element in enumerate(l):
        element_digest = hash_item(element)
        digest = hash((digest, position, element_digest))
    return digest
def hash_dict(d: dict) -> int:
    """
    Fold the items of ``d`` into one integer digest, in iteration order.

    Each step hashes the running digest together with the key and the
    ``hash_item`` value of the mapped value. An empty dict yields ``0``.
    """
    digest = 0
    for key in d:
        value_digest = hash_item(d[key])
        digest = hash((digest, key, value_digest))
    return digest
def hash_item(e) -> int:
    """
    Return an integer digest for ``e``, including common unhashable containers.

    Hashable objects use the built-in ``hash``. Otherwise sets are hashed
    independently of iteration order, lists and tuples go through
    ``hash_list`` and dicts through ``hash_dict``.

    :param e: The object to hash.
    :return: The digest.
    :raises TypeError: If ``e`` is unhashable and not one of the containers above.
    """
    if hasattr(e, "__hash__") and callable(e.__hash__):
        try:
            return hash(e)
        except TypeError:
            # e.g. a tuple holding a list: fall through to the container paths.
            pass
    if isinstance(e, set):
        # Equal sets may iterate in different orders (e.g. {0, 8} vs {8, 0}),
        # so hashing list(e) would give equal sets different digests. Set
        # members are always hashable, so a frozenset copy is safe to hash.
        return hash(frozenset(e))
    if isinstance(e, (list, tuple)):
        return hash_list(list(e))
    if isinstance(e, dict):
        return hash_dict(e)
    raise TypeError(f"unhashable type: {e.__class__}")
+PT = ParamSpec("PT")
+RT = TypeVar("RT")
def lru_cache_ext(
    *opts, hashfunc: Callable[..., int] = hash_item, **kwopts
) -> Callable[[Callable[PT, RT]], Callable[PT, RT]]:
    """
    ``functools.lru_cache`` variant that keys the cache on an integer digest.

    The call's arguments are reduced to one integer with ``hashfunc``, so
    arguments that are not natively hashable can still be cached. Only that
    integer is handed to ``lru_cache``.

    NOTE(review): two calls whose digests collide share one cache entry.
    NOTE(review): the arguments of the call being served are stashed on the
    wrapper class, so overlapping calls from several threads can observe each
    other's arguments. Confirm callers are serialised before relying on this.

    :param opts: Positional options forwarded to ``functools.lru_cache``.
    :param hashfunc: Function mapping an argument (or tuple) to an integer.
    :param kwopts: Keyword options forwarded to ``functools.lru_cache``.
    :return: A decorator; the decorated object is callable and exposes
        ``cache_info()`` and ``cache_clear()``.
    """

    def decorator(func: Callable[PT, RT]) -> Callable[PT, RT]:
        class _lru_cache_ext_wrapper:
            # Arguments of the most recent call; read back by cached_func on a
            # cache miss because lru_cache itself only receives the digest.
            args: tuple
            kwargs: dict[str, Any]

            # Typing stubs only; rebound to the real lru_cache methods below.
            def cache_info(self) -> _CacheInfo: ...

            def cache_clear(self) -> None: ...

            @classmethod
            @lru_cache(*opts, **kwopts)
            def cached_func(cls, args_hash: int) -> RT:
                return func(*cls.args, **cls.kwargs)

            @classmethod
            def __call__(cls, *args: PT.args, **kwargs: PT.kwargs) -> RT:
                positional_digests = [hashfunc(a) for a in args]
                keyword_digests = [
                    (hashfunc(k), hashfunc(v)) for k, v in kwargs.items()
                ]
                digest = hashfunc((id(func), *positional_digests, *keyword_digests))
                cls.args = args
                cls.kwargs = kwargs
                return cls.cached_func(digest)

        # Bind the real cache controls once, at decoration time. Previously
        # they were only installed during the first call, so cache_info() and
        # cache_clear() silently returned None on a never-called function.
        _lru_cache_ext_wrapper.cache_info = (
            _lru_cache_ext_wrapper.cached_func.cache_info
        )
        _lru_cache_ext_wrapper.cache_clear = (
            _lru_cache_ext_wrapper.cached_func.cache_clear
        )
        return _lru_cache_ext_wrapper()

    return decorator

modules/utils/zh_normalization/text_normlization.py CHANGED Viewed

@@ -72,9 +72,9 @@ class TextNormalizer():
         return sentences
     def _post_replace(self, sentence: str) -> str:
-        sentence = sentence.replace('/', '每')
-        sentence = sentence.replace('~', '至')
-        sentence = sentence.replace('～', '至')
         sentence = sentence.replace('①', '一')
         sentence = sentence.replace('②', '二')
         sentence = sentence.replace('③', '三')

         return sentences
     def _post_replace(self, sentence: str) -> str:
+        # sentence = sentence.replace('/', '每')
+        # sentence = sentence.replace('~', '至')
+        # sentence = sentence.replace('～', '至')
         sentence = sentence.replace('①', '一')
         sentence = sentence.replace('②', '二')
         sentence = sentence.replace('③', '三')

webui.py CHANGED Viewed

@@ -14,9 +14,11 @@ except:
 import os
 import logging
-from numpy import clip
 from modules.synthesize_audio import synthesize_audio
 logging.basicConfig(
     level=os.getenv("LOG_LEVEL", "INFO"),
@@ -25,20 +27,17 @@ logging.basicConfig(
 import gradio as gr
-import io
-import re
-import numpy as np
 import torch
 from modules.ssml import parse_ssml
 from modules.SynthesizeSegments import SynthesizeSegments, combine_audio_segments
-from modules.generate_audio import generate_audio, generate_audio_batch
 from modules.speaker import speaker_mgr
 from modules.data import styles_mgr
 from modules.api.utils import calc_spk_style
 from modules.normalization import text_normalize
 from modules import refiner, config
@@ -147,7 +146,7 @@ def tts_generate(
     prompt1 = prompt1 or params.get("prompt1", "")
     prompt2 = prompt2 or params.get("prompt2", "")
-    infer_seed = clip(infer_seed, -1, 2**32 - 1)
     infer_seed = int(infer_seed)
     if not disable_normalize:
@@ -869,31 +868,59 @@ if __name__ == "__main__":
         type=int,
         help="Max batch size for TTS",
     )
     args = parser.parse_args()
-    server_name = env.get_env_or_arg(args, "server_name", "0.0.0.0", str)
-    server_port = env.get_env_or_arg(args, "server_port", 7860, int)
-    share = env.get_env_or_arg(args, "share", False, bool)
-    debug = env.get_env_or_arg(args, "debug", False, bool)
-    auth = env.get_env_or_arg(args, "auth", None, str)
-    half = env.get_env_or_arg(args, "half", False, bool)
-    off_tqdm = env.get_env_or_arg(args, "off_tqdm", False, bool)
-    webui_config["tts_max"] = env.get_env_or_arg(args, "tts_max_len", 1000, int)
-    webui_config["ssml_max"] = env.get_env_or_arg(args, "ssml_max_len", 5000, int)
-    webui_config["max_batch_size"] = env.get_env_or_arg(args, "max_batch_size", 8, int)
     demo = create_interface()
     if auth:
         auth = tuple(auth.split(":"))
-    if half:
-        config.model_config["half"] = True
-    if off_tqdm:
-        config.disable_tqdm = True
     demo.queue().launch(
         server_name=server_name,

 import os
 import logging
+import numpy as np
+from modules.devices import devices
 from modules.synthesize_audio import synthesize_audio
+from modules.utils.cache import conditional_cache
 logging.basicConfig(
     level=os.getenv("LOG_LEVEL", "INFO"),
 import gradio as gr
 import torch
 from modules.ssml import parse_ssml
 from modules.SynthesizeSegments import SynthesizeSegments, combine_audio_segments
 from modules.speaker import speaker_mgr
 from modules.data import styles_mgr
 from modules.api.utils import calc_spk_style
+import modules.generate_audio as generate
 from modules.normalization import text_normalize
 from modules import refiner, config
     prompt1 = prompt1 or params.get("prompt1", "")
     prompt2 = prompt2 or params.get("prompt2", "")
+    infer_seed = np.clip(infer_seed, -1, 2**32 - 1)
     infer_seed = int(infer_seed)
     if not disable_normalize:
         type=int,
         help="Max batch size for TTS",
     )
+    parser.add_argument(
+        "--lru_size",
+        type=int,
+        default=64,
+        help="Set the size of the request cache pool, set it to 0 will disable lru_cache",
+    )
+    parser.add_argument(
+        "--device_id",
+        type=str,
+        help="Select the default CUDA device to use (export CUDA_VISIBLE_DEVICES=0,1,etc might be needed before)",
+        default=None,
+    )
+    parser.add_argument(
+        "--use_cpu",
+        nargs="+",
+        help="use CPU as torch device for specified modules",
+        default=[],
+        type=str.lower,
+    )
+    parser.add_argument("--compile", action="store_true", help="Enable model compile")
     args = parser.parse_args()
+    def get_and_update_env(*args):
+        val = env.get_env_or_arg(*args)
+        key = args[1]
+        config.runtime_env_vars[key] = val
+        return val
+    server_name = get_and_update_env(args, "server_name", "0.0.0.0", str)
+    server_port = get_and_update_env(args, "server_port", 7860, int)
+    share = get_and_update_env(args, "share", False, bool)
+    debug = get_and_update_env(args, "debug", False, bool)
+    auth = get_and_update_env(args, "auth", None, str)
+    half = get_and_update_env(args, "half", False, bool)
+    off_tqdm = get_and_update_env(args, "off_tqdm", False, bool)
+    lru_size = get_and_update_env(args, "lru_size", 64, int)
+    device_id = get_and_update_env(args, "device_id", None, str)
+    use_cpu = get_and_update_env(args, "use_cpu", [], list)
+    compile = get_and_update_env(args, "compile", False, bool)
+    webui_config["tts_max"] = get_and_update_env(args, "tts_max_len", 1000, int)
+    webui_config["ssml_max"] = get_and_update_env(args, "ssml_max_len", 5000, int)
+    webui_config["max_batch_size"] = get_and_update_env(args, "max_batch_size", 8, int)
     demo = create_interface()
     if auth:
         auth = tuple(auth.split(":"))
+    generate.setup_lru_cache()
+    devices.reset_device()
+    devices.first_time_calculation()
     demo.queue().launch(
         server_name=server_name,