import os
import logging
import sys

import numpy as np

from modules.devices import devices
from modules.synthesize_audio import synthesize_audio
from modules.hf import spaces
from modules.webui import webui_config

# Logging is configured here, ahead of the remaining imports, preserving the
# original statement order.
# NOTE(review): presumably so modules imported below pick up this format --
# confirm before moving this call.
logging.basicConfig(
    level=os.getenv("LOG_LEVEL", "INFO"),
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)

import gradio as gr

import torch

from modules.ssml import parse_ssml
from modules.SynthesizeSegments import SynthesizeSegments, combine_audio_segments

from modules.speaker import speaker_mgr
from modules.data import styles_mgr

from modules.api.utils import calc_spk_style
import modules.generate_audio as generate

from modules.normalization import text_normalize
from modules import refiner, config

from modules.utils import env, audio
from modules.SentenceSplitter import SentenceSplitter

# Fallback used when a caller-supplied batch size cannot be converted to int.
# Matches the declared default of both synthesize_ssml and tts_generate.
_DEFAULT_BATCH_SIZE = 4

# Inclusive bounds applied to the inference seed in tts_generate.
_SEED_MIN = -1
_SEED_MAX = 2**32 - 1


def _coerce_batch_size(value, fallback=_DEFAULT_BATCH_SIZE):
    """Return *value* converted to int, or *fallback* if conversion fails."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return fallback


def _clamp_seed(seed):
    """Return *seed* as an int clamped to ``[_SEED_MIN, _SEED_MAX]``.

    Plain Python arithmetic is used instead of ``np.clip(..., dtype=np.int64)``
    because the ufunc form may reject float seeds under NumPy's default
    casting rule and overflows on integers outside the int64 range.

    Raises:
        TypeError: if *seed* is ``None`` or otherwise not convertible to int.
        ValueError: if *seed* is a string or NaN that ``int()`` cannot parse.
        OverflowError: if *seed* is an infinite float.
    """
    return max(_SEED_MIN, min(int(seed), _SEED_MAX))


def get_speakers():
    """Return the speakers reported by ``speaker_mgr.list_speakers()``."""
    return speaker_mgr.list_speakers()


def get_styles():
    """Return the style items reported by ``styles_mgr.list_items()``."""
    return styles_mgr.list_items()


def segments_length_limit(segments, total_max: int):
    """Return the leading segments whose cumulative text length fits the limit.

    Args:
        segments: Iterable of mappings; only those with a ``"text"`` key are
            considered.
        total_max: Maximum allowed sum of ``len(seg["text"])``.

    Returns:
        A new list of the accepted segments, in their original order.
        Iteration stops at the first segment that pushes the running total
        above *total_max*, so later segments are discarded even if short.
    """
    ret_segments = []
    total_len = 0
    for seg in segments:
        # NOTE(review): segments without "text" are dropped from the result
        # entirely, not just excluded from the count -- confirm this is
        # intended for any non-text segment kinds parse_ssml may produce.
        if "text" not in seg:
            continue
        total_len += len(seg["text"])
        if total_len > total_max:
            break
        ret_segments.append(seg)
    return ret_segments


@torch.inference_mode()
@spaces.GPU
def synthesize_ssml(ssml: str, batch_size=4):
    """Synthesize audio from an SSML string.

    Args:
        ssml: SSML markup; surrounding whitespace is stripped.
        batch_size: Batch size handed to ``SynthesizeSegments``. Values that
            cannot be converted to int fall back to 4, the declared default.

    Returns:
        ``None`` if the stripped input is empty or no segment survives the
        ``webui_config.ssml_max`` length limit; otherwise the result of
        ``audio.pydub_to_np`` applied to the combined audio.
    """
    batch_size = _coerce_batch_size(batch_size)

    ssml = ssml.strip()
    if not ssml:
        return None

    segments = parse_ssml(ssml)
    segments = segments_length_limit(segments, webui_config.ssml_max)
    if not segments:
        return None

    synthesize = SynthesizeSegments(batch_size=batch_size)
    audio_segments = synthesize.synthesize_segments(segments)
    combined_audio = combine_audio_segments(audio_segments)

    return audio.pydub_to_np(combined_audio)


@torch.inference_mode()
@spaces.GPU
def tts_generate(
    text,
    temperature,
    top_p,
    top_k,
    spk,
    infer_seed,
    use_decoder,
    prompt1,
    prompt2,
    prefix,
    style,
    disable_normalize=False,
    batch_size=4,
):
    """Synthesize speech for plain text.

    The text is stripped and truncated to ``webui_config.tts_max`` characters.
    Falsy values of ``infer_seed``, ``temperature``, ``prefix``, ``prompt1``
    and ``prompt2`` are replaced by the corresponding entries returned from
    ``calc_spk_style(spk=spk, style=style)`` when present.

    Args:
        text: Input text.
        temperature, top_p, top_k: Sampling settings forwarded to
            ``synthesize_audio``; a float ``top_k`` is truncated to int.
        spk: Speaker; may be overridden by the ``"spk"`` style parameter.
        infer_seed: Seed, clamped to ``[-1, 2**32 - 1]`` after resolution.
        use_decoder, prompt1, prompt2, prefix: Forwarded to
            ``synthesize_audio``.
        style: Style name; the sentinel ``"*auto"`` is treated as no style.
        disable_normalize: When true, ``text_normalize`` is skipped.
        batch_size: Batch size; unparseable values fall back to 4.

    Returns:
        ``None`` if the stripped text is empty, otherwise a
        ``(sample_rate, audio_data)`` tuple where ``audio_data`` has been
        passed through ``audio.audio_to_int16``.

    Raises:
        TypeError: if no seed is available (``infer_seed`` falsy and the
            style parameters provide none).
    """
    batch_size = _coerce_batch_size(batch_size)

    max_len = webui_config.tts_max
    text = text.strip()[0:max_len]
    if not text:
        return None

    if style == "*auto":
        style = None

    if isinstance(top_k, float):
        top_k = int(top_k)

    params = calc_spk_style(spk=spk, style=style)
    spk = params.get("spk", spk)

    # Explicit (truthy) arguments win; otherwise fall back to style params.
    infer_seed = infer_seed or params.get("seed", infer_seed)
    temperature = temperature or params.get("temperature", temperature)
    prefix = prefix or params.get("prefix", prefix)
    prompt1 = prompt1 or params.get("prompt1", "")
    prompt2 = prompt2 or params.get("prompt2", "")

    infer_seed = _clamp_seed(infer_seed)

    if not disable_normalize:
        text = text_normalize(text)

    sample_rate, audio_data = synthesize_audio(
        text=text,
        temperature=temperature,
        top_P=top_p,
        top_K=top_k,
        spk=spk,
        infer_seed=infer_seed,
        use_decoder=use_decoder,
        prompt1=prompt1,
        prompt2=prompt2,
        prefix=prefix,
        batch_size=batch_size,
    )

    audio_data = audio.audio_to_int16(audio_data)
    return sample_rate, audio_data


@torch.inference_mode()
@spaces.GPU
def refine_text(text: str, prompt: str):
    """Normalize *text* and pass it to ``refiner.refine_text`` with *prompt*."""
    text = text_normalize(text)
    return refiner.refine_text(text, prompt=prompt)


@torch.inference_mode()
@spaces.GPU
def split_long_text(long_text_input):
    """Split long text into normalized sentences.

    The text is split by a ``SentenceSplitter`` built with
    ``webui_config.spliter_threshold``; each piece is then passed through
    ``text_normalize``.

    Returns:
        A list of ``[index, sentence, len(sentence)]`` rows, one per sentence.
    """
    spliter = SentenceSplitter(webui_config.spliter_threshold)
    sentences = spliter.parse(long_text_input)
    sentences = [text_normalize(s) for s in sentences]
    return [[i, text, len(text)] for i, text in enumerate(sentences)]