Spaces:
Running
Running
File size: 10,162 Bytes
01e655b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 |
from lxml import etree
from typing import Any, List, Dict
import numpy as np
import logging
from modules.data import styles_mgr
from modules.speaker import speaker_mgr
import random
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
def expand_spk(attrs: dict):
    """Resolve the ``spk`` entry of *attrs* in place.

    * An ``int`` value is left untouched.
    * A string consisting only of digits is converted to ``int``.
    * Any other value (including a missing key, read as ``""``) is passed to
      ``speaker_mgr.get_speaker`` and replaced by whatever that call returns.

    If the speaker lookup raises, the error is logged and ``attrs`` is left
    unchanged; nothing propagates to the caller.
    """
    input_spk = attrs.get("spk", "")
    if isinstance(input_spk, int):
        return
    if isinstance(input_spk, str) and input_spk.isdigit():
        attrs["spk"] = int(input_spk)
        return
    try:
        attrs["spk"] = speaker_mgr.get_speaker(input_spk)
    except Exception as e:
        # Best-effort lookup: a bad speaker name must not abort SSML parsing.
        # The message names the speaker step so it is not confused with the
        # style-expansion failure logged by expand_style.
        logger.error(f"apply speaker failed, {e}")
def expand_style(attrs: dict):
    """Merge the parameters of the named style into *attrs* in place.

    Does nothing when ``attrs`` has no ``style`` entry or the entry is the
    empty string. Otherwise the style name is looked up through
    ``styles_mgr.find_params_by_name`` and the returned mapping is merged
    into ``attrs``. A failing lookup is logged and otherwise ignored.
    """
    style_name = attrs.get("style", "")
    if style_name == "":
        return

    try:
        style_params = styles_mgr.find_params_by_name(str(style_name))
        attrs.update(style_params)
    except Exception as e:
        logger.error(f"apply style failed, {e}")
def merge_prompt(attrs: dict, elem):
    """Append ``[name_value]`` control tokens to ``attrs["prefix"]``.

    For each of ``oral`` (0-9), ``speed`` (0-9), ``laugh`` (0-2) and
    ``break`` (0-7), the value is read from *elem* and, when *elem* does not
    provide it, from *attrs*. An empty/missing value is skipped. The literals
    ``"max"`` and ``"min"`` map to the upper and lower bound; anything else is
    converted with ``int`` and clamped into the allowed range. Each resulting
    token is appended to ``attrs["prefix"]`` preceded by a single space; a
    missing or ``None`` prefix is treated as an empty string.

    Args:
        attrs: attribute dict, modified in place.
        elem: object exposing ``get(key, default)`` (the XML element the
            attributes are read from).

    Raises:
        ValueError: if a value is neither ``"min"``/``"max"`` nor parseable
            by ``int``.
    """

    def attr_num(attrs: Dict[str, Any], k: str, min_value: int, max_value: int):
        val = elem.get(k, attrs.get(k, ""))
        if val == "":
            return
        if val == "max":
            val = max_value
        elif val == "min":
            val = min_value
        # Clamp with builtins so the token is built from a plain int.
        val = max(min_value, min(int(val), max_value))
        if attrs.get("prefix") is None:
            attrs["prefix"] = ""
        attrs["prefix"] += f" [{k}_{val}]"

    attr_num(attrs, "oral", 0, 9)
    attr_num(attrs, "speed", 0, 9)
    attr_num(attrs, "laugh", 0, 2)
    attr_num(attrs, "break", 0, 7)
def apply_random_seed(attrs: dict):
    """Replace a ``"random"``/``"rand"`` seed request with a concrete seed.

    When ``attrs["seed"]`` is one of those two keywords, a random integer in
    ``[0, 2**32 - 1]`` is drawn, stored back into ``attrs["seed"]`` and
    logged. Any other value (or a missing key) leaves ``attrs`` untouched.
    """
    if attrs.get("seed", "") not in ("random", "rand"):
        return

    seed = random.randint(0, 2**32 - 1)
    attrs["seed"] = seed
    logger.info(f"random seed: {seed}")
class NotSupportSSML(Exception):
    """Raised by ``parse_ssml`` for a document whose version is not accepted."""
def parse_ssml(ssml: str) -> List[Dict[str, Any]]:
    """Parse an SSML document into a flat list of synthesis segments.

    Every ``<voice>`` element under the root contributes, in document order:

    * ``{"text": ..., "attrs": ...}`` for each non-blank piece of text: the
      text at the start of the voice, the text of each ``<prosody>`` child,
      and the tail text following any child element. Text is stripped.
    * ``{"break": <int>}`` for each ``<break>`` child, taken from its
      ``time`` attribute with an optional ``ms`` suffix removed (default 0).

    Voice attributes (``spk``, ``style``, ``seed``, ``top_p``, ``top_k``,
    ``temp``, ``prompt1``, ``prompt2``, ``prefix``, ``normalize``) are
    collected when present, then passed through ``expand_spk``,
    ``expand_style``, ``merge_prompt`` and ``apply_random_seed``. A ``temp``
    of ``"min"``/``"max"`` is replaced by ``1e-12``/``1``. Prosody segments
    use a copy of the voice attributes extended with ``rate``, ``volume`` and
    ``pitch``.

    The last segment produced by each voice additionally carries
    ``"is_end": True``. A voice that produces no segments contributes
    nothing.

    Raises:
        NotSupportSSML: if the root's ``version`` attribute is not ``"0.1"``.
    """
    root = etree.fromstring(ssml)

    ssml_version = root.get("version", "NONE")
    if ssml_version != "0.1":
        # f-string so the offending version actually appears in the message.
        raise NotSupportSSML(f"Unsupported ssml version: {ssml_version}")

    segments = []

    for voice in root.findall(".//voice"):
        voice_attrs = {
            "spk": voice.get("spk"),
            "style": voice.get("style"),
            "seed": voice.get("seed"),
            "top_p": voice.get("top_p"),
            "top_k": voice.get("top_k"),
            "temp": voice.get("temp"),
            "prompt1": voice.get("prompt1"),
            "prompt2": voice.get("prompt2"),
            "prefix": voice.get("prefix"),
            "normalize": voice.get("normalize"),
        }
        # Keep only the attributes that were actually set on the element.
        voice_attrs = {k: v for k, v in voice_attrs.items() if v is not None}

        expand_spk(voice_attrs)
        expand_style(voice_attrs)
        merge_prompt(voice_attrs, voice)
        apply_random_seed(voice_attrs)

        voice_segments = []

        if voice_attrs.get("temp", "") == "min":
            # ref: https://github.com/2noise/ChatTTS/issues/123#issue-2326908144
            voice_attrs["temp"] = 0.000000000001
        if voice_attrs.get("temp", "") == "max":
            voice_attrs["temp"] = 1

        # Handle the text at the start of the voice element.
        if voice.text and voice.text.strip():
            voice_segments.append(
                {"text": voice.text.strip(), "attrs": voice_attrs.copy()}
            )

        # Handle the child elements (prosody / break) and the text after them.
        for node in voice.iterchildren():
            if node.tag == "prosody":
                prosody_attrs = voice_attrs.copy()
                new_attrs = {
                    "rate": node.get("rate"),
                    "volume": node.get("volume"),
                    "pitch": node.get("pitch"),
                }
                prosody_attrs.update(
                    {k: v for k, v in new_attrs.items() if v is not None}
                )
                expand_style(prosody_attrs)
                merge_prompt(prosody_attrs, node)
                # Resolve the seed on the prosody's own attributes; re-applying
                # the style above may have reintroduced a "random" seed.
                apply_random_seed(prosody_attrs)

                if node.text and node.text.strip():
                    voice_segments.append(
                        {"text": node.text.strip(), "attrs": prosody_attrs}
                    )
            elif node.tag == "break":
                time_ms = int(node.get("time", "0").replace("ms", ""))
                voice_segments.append({"break": time_ms})

            # Text that follows the child element still belongs to the voice.
            if node.tail and node.tail.strip():
                voice_segments.append(
                    {"text": node.tail.strip(), "attrs": voice_attrs.copy()}
                )

        # An empty <voice> yields no segments; there is nothing to mark then.
        if voice_segments:
            voice_segments[-1]["is_end"] = True

        segments.extend(voice_segments)

    logger.info(f"collect len(segments): {len(segments)}")

    return segments
if __name__ == "__main__":
    # Example SSML inputs. Only ssml6 is parsed below; the others are kept as
    # ready-made samples to swap in.
    ssml1 = """
<speak version="0.1">
<voice spk="20398768" seed="42" temp="min" top_p="0.9" top_k="20">
电影中梁朝伟扮演的陈永仁的
<prosody volume="5">
编号27149
</prosody>
<prosody rate="2">
编号27149
</prosody>
<prosody pitch="-12">
编号27149
</prosody>
<prosody pitch="12">
编号27149
</prosody>
</voice>
<voice spk="20398768" seed="42" speed="9">
编号27149
</voice>
<voice spk="20398768" seed="42">
电影中梁朝伟扮演的陈永仁的编号27149
</voice>
</speak>
"""
    ssml2 = """
<speak version="0.1">
<voice spk="Bob">
也可以合成多角色多情感的有声 [uv_break] 书 [uv_break] ,例如:
</voice>
<voice spk="Bob">
黛玉冷笑道:
</voice>
<voice spk="female2">
我说呢,亏了绊住,不然,早就飞了来了。
</voice>
<voice spk="Bob" speed="0">
宝玉道:
</voice>
<voice spk="Alice">
“只许和你玩,替你解闷。不过偶然到他那里,就说这些闲话。”
</voice>
<voice spk="female2">
“好没意思的话!去不去,关我什么事儿?又没叫你替我解闷儿,还许你不理我呢”
</voice>
<voice spk="Bob">
说着,便赌气回房去了。
</voice>
</speak>
"""
    ssml22 = """
<speak version="0.1">
<voice spk="Bob" style="narration-relaxed">
下面是一个 ChatTTS 用于合成多角色多情感的有声书示例
</voice>
<voice spk="Bob" style="narration-relaxed">
黛玉冷笑道:
</voice>
<voice spk="female2" style="angry">
我说呢 [uv_break] ,亏了绊住,不然,早就飞起来了。
</voice>
<voice spk="Bob" style="narration-relaxed">
宝玉道:
</voice>
<voice spk="Alice" style="unfriendly">
“只许和你玩 [uv_break] ,替你解闷。不过偶然到他那里,就说这些闲话。”
</voice>
<voice spk="female2" style="angry">
“好没意思的话![uv_break] 去不去,关我什么事儿? 又没叫你替我解闷儿 [uv_break],还许你不理我呢”
</voice>
<voice spk="Bob" style="narration-relaxed">
说着,便赌气回房去了。
</voice>
</speak>
"""
    ssml3 = """
<speak version="0.1">
<voice spk="Bob" style="angry">
“你到底在想什么?这已经是第三次了!每次我都告诉你要按时完成任务,可你总是拖延。你知道这对整个团队有多大的影响吗?!”
</voice>
<voice spk="Bob" style="assistant">
“你到底在想什么?这已经是第三次了!每次我都告诉你要按时完成任务,可你总是拖延。你知道这对整个团队有多大的影响吗?!”
</voice>
<voice spk="Bob" style="gentle">
“你到底在想什么?这已经是第三次了!每次我都告诉你要按时完成任务,可你总是拖延。你知道这对整个团队有多大的影响吗?!”
</voice>
</speak>
"""
    ssml4 = """
<speak version="0.1">
<voice spk="Bob" style="narration-relaxed">
使用 prosody 控制生成文本的语速语调和音量,示例如下
<prosody>
无任何限制将会继承父级voice配置进行生成
</prosody>
<prosody rate="1.5">
设置 rate 大于1表示加速,小于1为减速
</prosody>
<prosody pitch="6">
设置 pitch 调整音调,设置为6表示提高6个半音
</prosody>
<prosody volume="2">
设置 volume 调整音量,设置为2表示提高2个分贝
</prosody>
在 voice 中无prosody包裹的文本即为默认生成状态下的语音
</voice>
</speak>
"""
    ssml5 = """
<speak version="0.1">
<voice spk="Bob" style="narration-relaxed">
使用 break 标签将会简单的
<break time="500" />
插入一段空白到生成结果中
</voice>
</speak>
"""
    ssml6 = """
<speak version="0.1">
<voice spk="Bob" style="excited">
temperature for sampling (may be overridden by style or speaker)
<break time="500" />
温度值用于采样,这个值有可能被 style 或者 speaker 覆盖
<break time="500" />
temperature for sampling ,这个值有可能被 style 或者 speaker 覆盖
<break time="500" />
温度值用于采样,(may be overridden by style or speaker)
</voice>
</speak>
"""
    # Parse one sample and dump the resulting segment list for inspection.
    segments = parse_ssml(ssml6)
    print(segments)
    # audio_segments = synthesize_segments(segments)
    # combined_audio = combine_audio_segments(audio_segments)
    # combined_audio.export("output.wav", format="wav")
|