Commit d5b3cd8 by zhzluke96
Parent: 84cfd61
modules/SynthesizeSegments.py CHANGED
@@ -9,6 +9,7 @@ from modules.normalization import text_normalize
 import logging
 import json
 import random
+import copy
 
 from modules.speaker import Speaker
 
@@ -61,6 +62,9 @@ class SynthesizeSegments:
         self.batch_size = batch_size
 
     def segment_to_generate_params(self, segment: Dict[str, Any]) -> Dict[str, Any]:
+        if segment.get("params", None) is not None:
+            return segment["params"]
+
         text = segment.get("text", "")
         is_end = segment.get("is_end", False)
 
@@ -111,19 +115,15 @@ class SynthesizeSegments:
         for segment in segments:
             params = self.segment_to_generate_params(segment)
 
-            key_params = params
+            key_params = copy.copy(params)
             if isinstance(key_params.get("spk"), Speaker):
                 key_params["spk"] = str(key_params["spk"].id)
             key = json.dumps(
                 {k: v for k, v in key_params.items() if k != "text"}, sort_keys=True
             )
-            if params["spk"] == -1 or params["infer_seed"] == -1:
-                key = random.random()
-                buckets[key] = [segment]
-            else:
-                if key not in buckets:
-                    buckets[key] = []
-                buckets[key].append(segment)
+            if key not in buckets:
+                buckets[key] = []
+            buckets[key].append(segment)
 
         # Convert dictionary to list of buckets
         bucket_list = list(buckets.values())
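
Taken together, these hunks do three things: segment_to_generate_params now honors a pre-attached "params" dict (which modules/synthesize_audio.py below relies on), the bucket key is built from a shallow copy so that stringifying a Speaker no longer mutates the caller's params, and the old special case that gave every spk == -1 / infer_seed == -1 segment its own single-element bucket is gone, so segments with identical non-text params always batch together. A minimal, self-contained sketch of the new bucketing behavior (Speaker is stubbed here; the real class lives in modules.speaker):

# Standalone sketch of the new bucketing logic; Speaker is a stub.
import copy
import json


class Speaker:  # stand-in for modules.speaker.Speaker
    def __init__(self, id):
        self.id = id


def bucket_segments(segments):
    buckets = {}
    for segment in segments:
        params = segment["params"]  # pre-attached, per the early return above
        key_params = copy.copy(params)  # shallow copy: don't mutate caller's dict
        if isinstance(key_params.get("spk"), Speaker):
            key_params["spk"] = str(key_params["spk"].id)
        key = json.dumps(
            {k: v for k, v in key_params.items() if k != "text"}, sort_keys=True
        )
        if key not in buckets:
            buckets[key] = []
        buckets[key].append(segment)
    return list(buckets.values())


spk = Speaker("a1")
segments = [
    {"text": "one", "params": {"text": "one", "spk": spk, "infer_seed": 42}},
    {"text": "two", "params": {"text": "two", "spk": spk, "infer_seed": 42}},
    {"text": "three", "params": {"text": "three", "spk": spk, "infer_seed": 7}},
]
print([len(b) for b in bucket_segments(segments)])  # [2, 1]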
modules/api/impl/ssml_api.py CHANGED
@@ -23,7 +23,7 @@ from modules.api.Api import APIManager
 class SSMLRequest(BaseModel):
     ssml: str
     format: str = "mp3"
-    batch: bool = False
+    batch_size: int = 4
 
 
 async def synthesize_ssml(
@@ -34,7 +34,12 @@ async def synthesize_ssml(
     try:
         ssml = request.ssml
        format = request.format
-        batch = request.batch
+        batch_size = request.batch_size
+
+        if batch_size < 1:
+            raise HTTPException(
+                status_code=400, detail="Batch size must be greater than 0."
+            )
 
         if not ssml:
             raise HTTPException(status_code=400, detail="SSML content is required.")
@@ -43,8 +48,8 @@
         for seg in segments:
             seg["text"] = text_normalize(seg["text"], is_end=True)
 
-        if batch:
-            synthesize = SynthesizeSegments(16)
+        if batch_size != 1:
+            synthesize = SynthesizeSegments(batch_size)
             audio_segments = synthesize.synthesize_segments(segments)
             combined_audio = combine_audio_segments(audio_segments)
             buffer = io.BytesIO()
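
Note this is a breaking API change: the boolean batch flag becomes batch_size: int = 4, validated to be at least 1. A hedged client sketch follows; only the request model is visible in this diff, so the route path and port are assumptions:

# Hypothetical client call -- the URL (host, port, and "/v1/ssml" path) is an
# assumption; only the SSMLRequest model appears in this diff.
import requests

resp = requests.post(
    "http://localhost:7870/v1/ssml",
    json={
        "ssml": "<speak><voice>Hello there.</voice></speak>",
        "format": "mp3",
        "batch_size": 8,  # replaces the old `batch: bool` flag; must be >= 1
    },
)
resp.raise_for_status()
with open("out.mp3", "wb") as f:
    f.write(resp.content)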
modules/synthesize_audio.py CHANGED
@@ -1,12 +1,12 @@
+import io
 from modules.SentenceSplitter import SentenceSplitter
-from modules.normalization import text_normalize
+from modules.SynthesizeSegments import SynthesizeSegments, combine_audio_segments
 
 from modules import generate_audio as generate
 
 
-import numpy as np
-
 from modules.speaker import Speaker
+from modules.utils import audio
 
 
 def synthesize_audio(
@@ -39,20 +39,28 @@ def synthesize_audio(
     else:
         spliter = SentenceSplitter(spliter_threshold)
         sentences = spliter.parse(text)
-        sentences = [text_normalize(s) for s in sentences]
-        audio_data_batch = generate.generate_audio_batch(
-            texts=sentences,
-            temperature=temperature,
-            top_P=top_P,
-            top_K=top_K,
-            spk=spk,
-            infer_seed=infer_seed,
-            use_decoder=use_decoder,
-            prompt1=prompt1,
-            prompt2=prompt2,
-            prefix=prefix,
-        )
-        sample_rate = audio_data_batch[0][0]
-        audio_data = np.concatenate([data for _, data in audio_data_batch])
 
-        return sample_rate, audio_data
+        text_segments = [
+            {
+                "text": s,
+                "params": {
+                    "text": s,
+                    "temperature": temperature,
+                    "top_P": top_P,
+                    "top_K": top_K,
+                    "spk": spk,
+                    "infer_seed": infer_seed,
+                    "use_decoder": use_decoder,
+                    "prompt1": prompt1,
+                    "prompt2": prompt2,
+                    "prefix": prefix,
+                },
+            }
+            for s in sentences
+        ]
+        synthesizer = SynthesizeSegments(batch_size)
+        audio_segments = synthesizer.synthesize_segments(text_segments)
+
+        combined_audio = combine_audio_segments(audio_segments)
+
+        return audio.pydub_to_np(combined_audio)
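
The long-text path now builds segments that carry a prebuilt "params" dict, exactly the shape segment_to_generate_params short-circuits on, and reuses the SSML batching pipeline. The return type also changes: the old code returned a mono np.float32 array via np.concatenate, while audio.pydub_to_np returns a [n_samples, channels] array. A hedged usage sketch (the full signature, including defaults for batch_size and spliter_threshold, is off-screen in this diff):

# Hedged usage sketch; argument names mirror the hunk above, but defaults for
# batch_size and spliter_threshold are assumptions (the signature is not shown).
from modules.synthesize_audio import synthesize_audio

sample_rate, audio_data = synthesize_audio(
    text="A long paragraph that SentenceSplitter will break into sentences ...",
    temperature=0.3,
    top_P=0.7,
    top_K=20,
    spk=-1,
    infer_seed=42,
    batch_size=4,  # sentences whose params match are synthesized in one batch
)
# audio_data: float32, shape [n_samples, channels], values in [-1.0, 1.0]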
modules/utils/audio.py CHANGED
@@ -9,9 +9,12 @@ INT16_MAX = np.iinfo(np.int16).max
 
 
 def audio_to_int16(audio_data):
-    if audio_data.dtype == np.float32:
-        audio_data = (audio_data * INT16_MAX).astype(np.int16)
-    if audio_data.dtype == np.float16:
+    if (
+        audio_data.dtype == np.float32
+        or audio_data.dtype == np.float64
+        or audio_data.dtype == np.float128
+        or audio_data.dtype == np.float16
+    ):
         audio_data = (audio_data * INT16_MAX).astype(np.int16)
     return audio_data
 
@@ -27,6 +30,21 @@ def audiosegment_to_librosawav(audiosegment):
     return fp_arr
 
 
+def pydub_to_np(audio: AudioSegment) -> tuple[int, np.ndarray]:
+    """
+    Converts a pydub AudioSegment into an np.float32 array of shape
+    [duration_in_seconds * sample_rate, channels], with values in [-1.0, 1.0].
+    Returns a tuple (sample_rate, audio_np_array).
+    """
+    return (
+        audio.frame_rate,
+        np.array(audio.get_array_of_samples(), dtype=np.float32).reshape(
+            (-1, audio.channels)
+        )
+        / (1 << (8 * audio.sample_width - 1)),
+    )
+
+
 def ndarray_to_segment(ndarray, frame_rate):
     buffer = BytesIO()
     sf.write(buffer, ndarray, frame_rate, format="wav")
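
Two details worth flagging: np.float128 is only defined on platforms whose NumPy builds expose extended precision (it is missing on Windows, where this comparison would raise AttributeError), and pydub_to_np's divisor 1 << (8 * audio.sample_width - 1) is the integer full scale for the segment's sample width (32768 for 16-bit audio). A quick self-check of the pydub_to_np contract, assuming pydub is installed:

# Self-check of pydub_to_np on a synthetic 16-bit mono segment.
import numpy as np
from pydub import AudioSegment

from modules.utils.audio import pydub_to_np

raw = (np.sin(np.linspace(0, 2 * np.pi, 16000)) * 32767).astype(np.int16)
seg = AudioSegment(raw.tobytes(), frame_rate=16000, sample_width=2, channels=1)

sr, samples = pydub_to_np(seg)
assert sr == 16000
assert samples.shape == (16000, 1)   # [n_samples, channels]
assert np.abs(samples).max() <= 1.0  # int16 full scale is 1 << 15 == 32768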
webui.py CHANGED
@@ -16,6 +16,8 @@ import logging
 
 from numpy import clip
 
+from modules.synthesize_audio import synthesize_audio
+
 logging.basicConfig(
     level=os.getenv("LOG_LEVEL", "INFO"),
     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
@@ -145,8 +147,8 @@ def tts_generate(
     top_k = int(top_k)
 
     params = calc_spk_style(spk=spk, style=style)
-
     spk = params.get("spk", spk)
+
     infer_seed = infer_seed or params.get("seed", infer_seed)
     temperature = temperature or params.get("temperature", temperature)
     prefix = prefix or params.get("prefix", prefix)
@@ -159,37 +161,19 @@
     if not disable_normalize:
         text = text_normalize(text)
 
-    if batch_size == 1:
-        sample_rate, audio_data = generate_audio(
-            text=text,
-            temperature=temperature,
-            top_P=top_p,
-            top_K=top_k,
-            spk=spk,
-            infer_seed=infer_seed,
-            use_decoder=use_decoder,
-            prompt1=prompt1,
-            prompt2=prompt2,
-            prefix=prefix,
-        )
-    else:
-        spliter = SentenceSplitter(webui_config["spliter_threshold"])
-        sentences = spliter.parse(text)
-        sentences = [text_normalize(s) for s in sentences]
-        audio_data_batch = generate_audio_batch(
-            texts=sentences,
-            temperature=temperature,
-            top_P=top_p,
-            top_K=top_k,
-            spk=spk,
-            infer_seed=infer_seed,
-            use_decoder=use_decoder,
-            prompt1=prompt1,
-            prompt2=prompt2,
-            prefix=prefix,
-        )
-        sample_rate = audio_data_batch[0][0]
-        audio_data = np.concatenate([data for _, data in audio_data_batch])
+    sample_rate, audio_data = synthesize_audio(
+        text=text,
+        temperature=temperature,
+        top_P=top_p,
+        top_K=top_k,
+        spk=spk,
+        infer_seed=infer_seed,
+        use_decoder=use_decoder,
+        prompt1=prompt1,
+        prompt2=prompt2,
+        prefix=prefix,
+        batch_size=batch_size,
+    )
 
     audio_data = audio.audio_to_int16(audio_data)
     return sample_rate, audio_data
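
With this change every webui generation funnels through synthesize_audio, deleting the duplicated single/batch branches; the only webui-specific step left is converting the float output to int16 for the Gradio audio component. A small numeric check of that last step:

# audio_to_int16 scales float audio in [-1, 1] by INT16_MAX and truncates.
import numpy as np
from modules.utils import audio

audio_data = np.array([0.0, 0.5, -1.0], dtype=np.float32)
print(audio.audio_to_int16(audio_data))  # [     0  16383 -32767]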