mrfakename committed on
Commit
342cd99
1 Parent(s): ad63082

Sync from GitHub repo

Browse files

This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there

Files changed (1) hide show
  1. app.py +41 -99
app.py CHANGED
@@ -112,101 +112,34 @@ E2TTS_ema_model = load_model(
112
  "E2-TTS", "E2TTS_Base", UNetT, E2TTS_model_cfg, 1200000
113
  )
114
 
115
- def split_text_into_batches(text, max_chars=200, split_words=SPLIT_WORDS):
116
- if len(text.encode('utf-8')) <= max_chars:
117
- return [text]
118
- if text[-1] not in ['。', '.', '!', '!', '?', '?']:
119
- text += '.'
120
-
121
- sentences = re.split('([。.!?!?])', text)
122
- sentences = [''.join(i) for i in zip(sentences[0::2], sentences[1::2])]
123
-
124
- batches = []
125
- current_batch = ""
126
-
127
- def split_by_words(text):
128
- words = text.split()
129
- current_word_part = ""
130
- word_batches = []
131
- for word in words:
132
- if len(current_word_part.encode('utf-8')) + len(word.encode('utf-8')) + 1 <= max_chars:
133
- current_word_part += word + ' '
134
- else:
135
- if current_word_part:
136
- # Try to find a suitable split word
137
- for split_word in split_words:
138
- split_index = current_word_part.rfind(' ' + split_word + ' ')
139
- if split_index != -1:
140
- word_batches.append(current_word_part[:split_index].strip())
141
- current_word_part = current_word_part[split_index:].strip() + ' '
142
- break
143
- else:
144
- # If no suitable split word found, just append the current part
145
- word_batches.append(current_word_part.strip())
146
- current_word_part = ""
147
- current_word_part += word + ' '
148
- if current_word_part:
149
- word_batches.append(current_word_part.strip())
150
- return word_batches
151
 
152
  for sentence in sentences:
153
- if len(current_batch.encode('utf-8')) + len(sentence.encode('utf-8')) <= max_chars:
154
- current_batch += sentence
155
  else:
156
- # If adding this sentence would exceed the limit
157
- if current_batch:
158
- batches.append(current_batch)
159
- current_batch = ""
160
-
161
- # If the sentence itself is longer than max_chars, split it
162
- if len(sentence.encode('utf-8')) > max_chars:
163
- # First, try to split by colon
164
- colon_parts = sentence.split(':')
165
- if len(colon_parts) > 1:
166
- for part in colon_parts:
167
- if len(part.encode('utf-8')) <= max_chars:
168
- batches.append(part)
169
- else:
170
- # If colon part is still too long, split by comma
171
- comma_parts = re.split('[,,]', part)
172
- if len(comma_parts) > 1:
173
- current_comma_part = ""
174
- for comma_part in comma_parts:
175
- if len(current_comma_part.encode('utf-8')) + len(comma_part.encode('utf-8')) <= max_chars:
176
- current_comma_part += comma_part + ','
177
- else:
178
- if current_comma_part:
179
- batches.append(current_comma_part.rstrip(','))
180
- current_comma_part = comma_part + ','
181
- if current_comma_part:
182
- batches.append(current_comma_part.rstrip(','))
183
- else:
184
- # If no comma, split by words
185
- batches.extend(split_by_words(part))
186
- else:
187
- # If no colon, split by comma
188
- comma_parts = re.split('[,,]', sentence)
189
- if len(comma_parts) > 1:
190
- current_comma_part = ""
191
- for comma_part in comma_parts:
192
- if len(current_comma_part.encode('utf-8')) + len(comma_part.encode('utf-8')) <= max_chars:
193
- current_comma_part += comma_part + ','
194
- else:
195
- if current_comma_part:
196
- batches.append(current_comma_part.rstrip(','))
197
- current_comma_part = comma_part + ','
198
- if current_comma_part:
199
- batches.append(current_comma_part.rstrip(','))
200
- else:
201
- # If no comma, split by words
202
- batches.extend(split_by_words(sentence))
203
- else:
204
- current_batch = sentence
205
-
206
- if current_batch:
207
- batches.append(current_batch)
208
-
209
- return batches
210
 
211
  @gpu_decorator
212
  def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence, progress=gr.Progress()):
@@ -306,7 +239,9 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, custom_s
306
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
307
  aseg = AudioSegment.from_file(ref_audio_orig)
308
 
309
- non_silent_segs = silence.split_on_silence(aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=500)
 
 
310
  non_silent_wave = AudioSegment.silent(duration=0)
311
  for non_silent_seg in non_silent_segs:
312
  non_silent_wave += non_silent_seg
@@ -332,13 +267,20 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, custom_s
332
  else:
333
  gr.Info("Using custom reference text...")
334
 
335
- # Split the input text into batches
 
 
 
 
 
 
336
  audio, sr = torchaudio.load(ref_audio)
337
- max_chars = int(len(ref_text.encode('utf-8')) / (audio.shape[-1] / sr) * (30 - audio.shape[-1] / sr))
338
- gen_text_batches = split_text_into_batches(gen_text, max_chars=max_chars)
 
339
  print('ref_text', ref_text)
340
- for i, gen_text in enumerate(gen_text_batches):
341
- print(f'gen_text {i}', gen_text)
342
 
343
  gr.Info(f"Generating audio using {exp_name} in {len(gen_text_batches)} batches")
344
  return infer_batch((audio, sr), ref_text, gen_text_batches, exp_name, remove_silence)
@@ -823,4 +765,4 @@ def main(port, host, share, api):
823
 
824
 
825
  if __name__ == "__main__":
826
- main()
 
112
  "E2-TTS", "E2TTS_Base", UNetT, E2TTS_model_cfg, 1200000
113
  )
114
 
115
def chunk_text(text, max_chars=135):
    """
    Split the input text into chunks, each at most ``max_chars`` UTF-8 bytes.

    Byte length (not character count) is used so that CJK text, where one
    character encodes to 3 bytes, yields batches comparable in spoken
    duration to ASCII text of the same byte budget.

    Args:
        text (str): The text to be split.
        max_chars (int): The maximum number of UTF-8 bytes per chunk.

    Returns:
        List[str]: A list of text chunks (empty list for empty input).
    """
    chunks = []
    current_chunk = ""
    # Split on ASCII punctuation followed by whitespace, and directly after
    # CJK punctuation (CJK sentences carry no trailing space, so the
    # whitespace-only pattern would never split them).
    sentences = re.split(r"(?<=[;:,.!?])\s+|(?<=[;:,。!?])", text)

    for sentence in sentences:
        if len(current_chunk.encode("utf-8")) + len(sentence.encode("utf-8")) <= max_chars:
            # Re-insert the separating space only after single-byte (ASCII)
            # punctuation; CJK sentence ends need no space.
            current_chunk += sentence + " " if sentence and len(sentence[-1].encode("utf-8")) == 1 else sentence
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence + " " if sentence and len(sentence[-1].encode("utf-8")) == 1 else sentence

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
  @gpu_decorator
145
  def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence, progress=gr.Progress()):
 
239
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
240
  aseg = AudioSegment.from_file(ref_audio_orig)
241
 
242
+ non_silent_segs = silence.split_on_silence(
243
+ aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=500
244
+ )
245
  non_silent_wave = AudioSegment.silent(duration=0)
246
  for non_silent_seg in non_silent_segs:
247
  non_silent_wave += non_silent_seg
 
267
  else:
268
  gr.Info("Using custom reference text...")
269
 
270
+ # Add the functionality to ensure it ends with ". "
271
+ if not ref_text.endswith(". "):
272
+ if ref_text.endswith("."):
273
+ ref_text += " "
274
+ else:
275
+ ref_text += ". "
276
+
277
  audio, sr = torchaudio.load(ref_audio)
278
+
279
+ # Use the new chunk_text function to split gen_text
280
+ gen_text_batches = chunk_text(gen_text, max_chars=135)
281
  print('ref_text', ref_text)
282
+ for i, batch_text in enumerate(gen_text_batches):
283
+ print(f'gen_text {i}', batch_text)
284
 
285
  gr.Info(f"Generating audio using {exp_name} in {len(gen_text_batches)} batches")
286
  return infer_batch((audio, sr), ref_text, gen_text_batches, exp_name, remove_silence)
 
765
 
766
 
767
  if __name__ == "__main__":
768
+ main()