Spaces:
Running
Running
zhzluke96
committed on
Commit
•
d5b3cd8
1
Parent(s):
84cfd61
update
Browse files
- modules/SynthesizeSegments.py +8 -8
- modules/api/impl/ssml_api.py +9 -4
- modules/synthesize_audio.py +27 -19
- modules/utils/audio.py +21 -3
- webui.py +16 -32
modules/SynthesizeSegments.py
CHANGED
@@ -9,6 +9,7 @@ from modules.normalization import text_normalize
|
|
9 |
import logging
|
10 |
import json
|
11 |
import random
|
|
|
12 |
|
13 |
from modules.speaker import Speaker
|
14 |
|
@@ -61,6 +62,9 @@ class SynthesizeSegments:
|
|
61 |
self.batch_size = batch_size
|
62 |
|
63 |
def segment_to_generate_params(self, segment: Dict[str, Any]) -> Dict[str, Any]:
|
|
|
|
|
|
|
64 |
text = segment.get("text", "")
|
65 |
is_end = segment.get("is_end", False)
|
66 |
|
@@ -111,19 +115,15 @@ class SynthesizeSegments:
|
|
111 |
for segment in segments:
|
112 |
params = self.segment_to_generate_params(segment)
|
113 |
|
114 |
-
key_params = params
|
115 |
if isinstance(key_params.get("spk"), Speaker):
|
116 |
key_params["spk"] = str(key_params["spk"].id)
|
117 |
key = json.dumps(
|
118 |
{k: v for k, v in key_params.items() if k != "text"}, sort_keys=True
|
119 |
)
|
120 |
-
if
|
121 |
-
key =
|
122 |
-
|
123 |
-
else:
|
124 |
-
if key not in buckets:
|
125 |
-
buckets[key] = []
|
126 |
-
buckets[key].append(segment)
|
127 |
|
128 |
# Convert dictionary to list of buckets
|
129 |
bucket_list = list(buckets.values())
|
|
|
9 |
import logging
|
10 |
import json
|
11 |
import random
|
12 |
+
import copy
|
13 |
|
14 |
from modules.speaker import Speaker
|
15 |
|
|
|
62 |
self.batch_size = batch_size
|
63 |
|
64 |
def segment_to_generate_params(self, segment: Dict[str, Any]) -> Dict[str, Any]:
|
65 |
+
if segment.get("params", None) is not None:
|
66 |
+
return segment["params"]
|
67 |
+
|
68 |
text = segment.get("text", "")
|
69 |
is_end = segment.get("is_end", False)
|
70 |
|
|
|
115 |
for segment in segments:
|
116 |
params = self.segment_to_generate_params(segment)
|
117 |
|
118 |
+
key_params = copy.copy(params)
|
119 |
if isinstance(key_params.get("spk"), Speaker):
|
120 |
key_params["spk"] = str(key_params["spk"].id)
|
121 |
key = json.dumps(
|
122 |
{k: v for k, v in key_params.items() if k != "text"}, sort_keys=True
|
123 |
)
|
124 |
+
if key not in buckets:
|
125 |
+
buckets[key] = []
|
126 |
+
buckets[key].append(segment)
|
|
|
|
|
|
|
|
|
127 |
|
128 |
# Convert dictionary to list of buckets
|
129 |
bucket_list = list(buckets.values())
|
modules/api/impl/ssml_api.py
CHANGED
@@ -23,7 +23,7 @@ from modules.api.Api import APIManager
|
|
23 |
class SSMLRequest(BaseModel):
|
24 |
ssml: str
|
25 |
format: str = "mp3"
|
26 |
-
|
27 |
|
28 |
|
29 |
async def synthesize_ssml(
|
@@ -34,7 +34,12 @@ async def synthesize_ssml(
|
|
34 |
try:
|
35 |
ssml = request.ssml
|
36 |
format = request.format
|
37 |
-
|
|
|
|
|
|
|
|
|
|
|
38 |
|
39 |
if not ssml:
|
40 |
raise HTTPException(status_code=400, detail="SSML content is required.")
|
@@ -43,8 +48,8 @@ async def synthesize_ssml(
|
|
43 |
for seg in segments:
|
44 |
seg["text"] = text_normalize(seg["text"], is_end=True)
|
45 |
|
46 |
-
if
|
47 |
-
synthesize = SynthesizeSegments(
|
48 |
audio_segments = synthesize.synthesize_segments(segments)
|
49 |
combined_audio = combine_audio_segments(audio_segments)
|
50 |
buffer = io.BytesIO()
|
|
|
23 |
class SSMLRequest(BaseModel):
|
24 |
ssml: str
|
25 |
format: str = "mp3"
|
26 |
+
batch_size: int = 4
|
27 |
|
28 |
|
29 |
async def synthesize_ssml(
|
|
|
34 |
try:
|
35 |
ssml = request.ssml
|
36 |
format = request.format
|
37 |
+
batch_size = request.batch_size
|
38 |
+
|
39 |
+
if batch_size < 1:
|
40 |
+
raise HTTPException(
|
41 |
+
status_code=400, detail="Batch size must be greater than 0."
|
42 |
+
)
|
43 |
|
44 |
if not ssml:
|
45 |
raise HTTPException(status_code=400, detail="SSML content is required.")
|
|
|
48 |
for seg in segments:
|
49 |
seg["text"] = text_normalize(seg["text"], is_end=True)
|
50 |
|
51 |
+
if batch_size != 1:
|
52 |
+
synthesize = SynthesizeSegments(batch_size)
|
53 |
audio_segments = synthesize.synthesize_segments(segments)
|
54 |
combined_audio = combine_audio_segments(audio_segments)
|
55 |
buffer = io.BytesIO()
|
modules/synthesize_audio.py
CHANGED
@@ -1,12 +1,12 @@
|
|
|
|
1 |
from modules.SentenceSplitter import SentenceSplitter
|
2 |
-
from modules.
|
3 |
|
4 |
from modules import generate_audio as generate
|
5 |
|
6 |
|
7 |
-
import numpy as np
|
8 |
-
|
9 |
from modules.speaker import Speaker
|
|
|
10 |
|
11 |
|
12 |
def synthesize_audio(
|
@@ -39,20 +39,28 @@ def synthesize_audio(
|
|
39 |
else:
|
40 |
spliter = SentenceSplitter(spliter_threshold)
|
41 |
sentences = spliter.parse(text)
|
42 |
-
sentences = [text_normalize(s) for s in sentences]
|
43 |
-
audio_data_batch = generate.generate_audio_batch(
|
44 |
-
texts=sentences,
|
45 |
-
temperature=temperature,
|
46 |
-
top_P=top_P,
|
47 |
-
top_K=top_K,
|
48 |
-
spk=spk,
|
49 |
-
infer_seed=infer_seed,
|
50 |
-
use_decoder=use_decoder,
|
51 |
-
prompt1=prompt1,
|
52 |
-
prompt2=prompt2,
|
53 |
-
prefix=prefix,
|
54 |
-
)
|
55 |
-
sample_rate = audio_data_batch[0][0]
|
56 |
-
audio_data = np.concatenate([data for _, data in audio_data_batch])
|
57 |
|
58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import io
|
2 |
from modules.SentenceSplitter import SentenceSplitter
|
3 |
+
from modules.SynthesizeSegments import SynthesizeSegments, combine_audio_segments
|
4 |
|
5 |
from modules import generate_audio as generate
|
6 |
|
7 |
|
|
|
|
|
8 |
from modules.speaker import Speaker
|
9 |
+
from modules.utils import audio
|
10 |
|
11 |
|
12 |
def synthesize_audio(
|
|
|
39 |
else:
|
40 |
spliter = SentenceSplitter(spliter_threshold)
|
41 |
sentences = spliter.parse(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
+
text_segments = [
|
44 |
+
{
|
45 |
+
"text": s,
|
46 |
+
"params": {
|
47 |
+
"text": s,
|
48 |
+
"temperature": temperature,
|
49 |
+
"top_P": top_P,
|
50 |
+
"top_K": top_K,
|
51 |
+
"spk": spk,
|
52 |
+
"infer_seed": infer_seed,
|
53 |
+
"use_decoder": use_decoder,
|
54 |
+
"prompt1": prompt1,
|
55 |
+
"prompt2": prompt2,
|
56 |
+
"prefix": prefix,
|
57 |
+
},
|
58 |
+
}
|
59 |
+
for s in sentences
|
60 |
+
]
|
61 |
+
synthesizer = SynthesizeSegments(batch_size)
|
62 |
+
audio_segments = synthesizer.synthesize_segments(text_segments)
|
63 |
+
|
64 |
+
combined_audio = combine_audio_segments(audio_segments)
|
65 |
+
|
66 |
+
return audio.pydub_to_np(combined_audio)
|
modules/utils/audio.py
CHANGED
@@ -9,9 +9,12 @@ INT16_MAX = np.iinfo(np.int16).max
|
|
9 |
|
10 |
|
11 |
def audio_to_int16(audio_data):
|
12 |
-
if
|
13 |
-
audio_data
|
14 |
-
|
|
|
|
|
|
|
15 |
audio_data = (audio_data * INT16_MAX).astype(np.int16)
|
16 |
return audio_data
|
17 |
|
@@ -27,6 +30,21 @@ def audiosegment_to_librosawav(audiosegment):
|
|
27 |
return fp_arr
|
28 |
|
29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
def ndarray_to_segment(ndarray, frame_rate):
|
31 |
buffer = BytesIO()
|
32 |
sf.write(buffer, ndarray, frame_rate, format="wav")
|
|
|
9 |
|
10 |
|
11 |
def audio_to_int16(audio_data):
|
12 |
+
if (
|
13 |
+
audio_data.dtype == np.float32
|
14 |
+
or audio_data.dtype == np.float64
|
15 |
+
or audio_data.dtype == np.float128
|
16 |
+
or audio_data.dtype == np.float16
|
17 |
+
):
|
18 |
audio_data = (audio_data * INT16_MAX).astype(np.int16)
|
19 |
return audio_data
|
20 |
|
|
|
30 |
return fp_arr
|
31 |
|
32 |
|
33 |
+
def pydub_to_np(audio: AudioSegment) -> tuple[int, np.ndarray]:
|
34 |
+
"""
|
35 |
+
Converts pydub audio segment into np.float32 of shape [duration_in_seconds*sample_rate, channels],
|
36 |
+
where each value is in range [-1.0, 1.0].
|
37 |
+
Returns tuple (sample_rate, audio_np_array).
|
38 |
+
"""
|
39 |
+
return (
|
40 |
+
audio.frame_rate,
|
41 |
+
np.array(audio.get_array_of_samples(), dtype=np.float32).reshape(
|
42 |
+
(-1, audio.channels)
|
43 |
+
)
|
44 |
+
/ (1 << (8 * audio.sample_width - 1)),
|
45 |
+
)
|
46 |
+
|
47 |
+
|
48 |
def ndarray_to_segment(ndarray, frame_rate):
|
49 |
buffer = BytesIO()
|
50 |
sf.write(buffer, ndarray, frame_rate, format="wav")
|
webui.py
CHANGED
@@ -16,6 +16,8 @@ import logging
|
|
16 |
|
17 |
from numpy import clip
|
18 |
|
|
|
|
|
19 |
logging.basicConfig(
|
20 |
level=os.getenv("LOG_LEVEL", "INFO"),
|
21 |
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
@@ -145,8 +147,8 @@ def tts_generate(
|
|
145 |
top_k = int(top_k)
|
146 |
|
147 |
params = calc_spk_style(spk=spk, style=style)
|
148 |
-
|
149 |
spk = params.get("spk", spk)
|
|
|
150 |
infer_seed = infer_seed or params.get("seed", infer_seed)
|
151 |
temperature = temperature or params.get("temperature", temperature)
|
152 |
prefix = prefix or params.get("prefix", prefix)
|
@@ -159,37 +161,19 @@ def tts_generate(
|
|
159 |
if not disable_normalize:
|
160 |
text = text_normalize(text)
|
161 |
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
else:
|
176 |
-
spliter = SentenceSplitter(webui_config["spliter_threshold"])
|
177 |
-
sentences = spliter.parse(text)
|
178 |
-
sentences = [text_normalize(s) for s in sentences]
|
179 |
-
audio_data_batch = generate_audio_batch(
|
180 |
-
texts=sentences,
|
181 |
-
temperature=temperature,
|
182 |
-
top_P=top_p,
|
183 |
-
top_K=top_k,
|
184 |
-
spk=spk,
|
185 |
-
infer_seed=infer_seed,
|
186 |
-
use_decoder=use_decoder,
|
187 |
-
prompt1=prompt1,
|
188 |
-
prompt2=prompt2,
|
189 |
-
prefix=prefix,
|
190 |
-
)
|
191 |
-
sample_rate = audio_data_batch[0][0]
|
192 |
-
audio_data = np.concatenate([data for _, data in audio_data_batch])
|
193 |
|
194 |
audio_data = audio.audio_to_int16(audio_data)
|
195 |
return sample_rate, audio_data
|
|
|
16 |
|
17 |
from numpy import clip
|
18 |
|
19 |
+
from modules.synthesize_audio import synthesize_audio
|
20 |
+
|
21 |
logging.basicConfig(
|
22 |
level=os.getenv("LOG_LEVEL", "INFO"),
|
23 |
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
|
|
147 |
top_k = int(top_k)
|
148 |
|
149 |
params = calc_spk_style(spk=spk, style=style)
|
|
|
150 |
spk = params.get("spk", spk)
|
151 |
+
|
152 |
infer_seed = infer_seed or params.get("seed", infer_seed)
|
153 |
temperature = temperature or params.get("temperature", temperature)
|
154 |
prefix = prefix or params.get("prefix", prefix)
|
|
|
161 |
if not disable_normalize:
|
162 |
text = text_normalize(text)
|
163 |
|
164 |
+
sample_rate, audio_data = synthesize_audio(
|
165 |
+
text=text,
|
166 |
+
temperature=temperature,
|
167 |
+
top_P=top_p,
|
168 |
+
top_K=top_k,
|
169 |
+
spk=spk,
|
170 |
+
infer_seed=infer_seed,
|
171 |
+
use_decoder=use_decoder,
|
172 |
+
prompt1=prompt1,
|
173 |
+
prompt2=prompt2,
|
174 |
+
prefix=prefix,
|
175 |
+
batch_size=batch_size,
|
176 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
177 |
|
178 |
audio_data = audio.audio_to_int16(audio_data)
|
179 |
return sample_rate, audio_data
|