Spaces:
Sleeping
Sleeping
import os | |
import logging | |
import sys | |
import numpy as np | |
from modules.devices import devices | |
from modules.synthesize_audio import synthesize_audio | |
from modules.hf import spaces | |
from modules.webui import webui_config | |
logging.basicConfig( | |
level=os.getenv("LOG_LEVEL", "INFO"), | |
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", | |
) | |
import gradio as gr | |
import torch | |
from modules.ssml import parse_ssml | |
from modules.SynthesizeSegments import SynthesizeSegments, combine_audio_segments | |
from modules.speaker import speaker_mgr | |
from modules.data import styles_mgr | |
from modules.api.utils import calc_spk_style | |
import modules.generate_audio as generate | |
from modules.normalization import text_normalize | |
from modules import refiner, config | |
from modules.utils import env, audio | |
from modules.SentenceSplitter import SentenceSplitter | |
def get_speakers(): | |
return speaker_mgr.list_speakers() | |
def get_styles(): | |
return styles_mgr.list_items() | |
def segments_length_limit(segments, total_max: int): | |
ret_segments = [] | |
total_len = 0 | |
for seg in segments: | |
if "text" not in seg: | |
continue | |
total_len += len(seg["text"]) | |
if total_len > total_max: | |
break | |
ret_segments.append(seg) | |
return ret_segments | |
def synthesize_ssml(ssml: str, batch_size=4): | |
try: | |
batch_size = int(batch_size) | |
except Exception: | |
batch_size = 8 | |
ssml = ssml.strip() | |
if ssml == "": | |
return None | |
segments = parse_ssml(ssml) | |
max_len = webui_config.ssml_max | |
segments = segments_length_limit(segments, max_len) | |
if len(segments) == 0: | |
return None | |
synthesize = SynthesizeSegments(batch_size=batch_size) | |
audio_segments = synthesize.synthesize_segments(segments) | |
combined_audio = combine_audio_segments(audio_segments) | |
return audio.pydub_to_np(combined_audio) | |
def tts_generate( | |
text, | |
temperature, | |
top_p, | |
top_k, | |
spk, | |
infer_seed, | |
use_decoder, | |
prompt1, | |
prompt2, | |
prefix, | |
style, | |
disable_normalize=False, | |
batch_size=4, | |
): | |
try: | |
batch_size = int(batch_size) | |
except Exception: | |
batch_size = 4 | |
max_len = webui_config.tts_max | |
text = text.strip()[0:max_len] | |
if text == "": | |
return None | |
if style == "*auto": | |
style = None | |
if isinstance(top_k, float): | |
top_k = int(top_k) | |
params = calc_spk_style(spk=spk, style=style) | |
spk = params.get("spk", spk) | |
infer_seed = infer_seed or params.get("seed", infer_seed) | |
temperature = temperature or params.get("temperature", temperature) | |
prefix = prefix or params.get("prefix", prefix) | |
prompt1 = prompt1 or params.get("prompt1", "") | |
prompt2 = prompt2 or params.get("prompt2", "") | |
infer_seed = np.clip(infer_seed, -1, 2**32 - 1, out=None, dtype=np.int64) | |
infer_seed = int(infer_seed) | |
if not disable_normalize: | |
text = text_normalize(text) | |
sample_rate, audio_data = synthesize_audio( | |
text=text, | |
temperature=temperature, | |
top_P=top_p, | |
top_K=top_k, | |
spk=spk, | |
infer_seed=infer_seed, | |
use_decoder=use_decoder, | |
prompt1=prompt1, | |
prompt2=prompt2, | |
prefix=prefix, | |
batch_size=batch_size, | |
) | |
audio_data = audio.audio_to_int16(audio_data) | |
return sample_rate, audio_data | |
def refine_text(text: str, prompt: str): | |
text = text_normalize(text) | |
return refiner.refine_text(text, prompt=prompt) | |
def split_long_text(long_text_input): | |
spliter = SentenceSplitter(webui_config.spliter_threshold) | |
sentences = spliter.parse(long_text_input) | |
sentences = [text_normalize(s) for s in sentences] | |
data = [] | |
for i, text in enumerate(sentences): | |
data.append([i, text, len(text)]) | |
return data | |