ovieyra21 committed on
Commit cd8f9a0 • 1 Parent(s): 0b9eb79

Update app.py

Files changed (1)
  1. app.py +156 -76
app.py CHANGED
@@ -1,4 +1,46 @@
+ [41 lines of pasted Hugging Face page chrome (site navigation, Space breadcrumbs and file-view header) added at the top of app.py]
  import torch
+ 
  import gradio as gr
  import yt_dlp as youtube_dl
  import numpy as np
@@ -11,9 +53,9 @@ from transformers.pipelines.audio_utils import ffmpeg_read
  import tempfile
  import os
  import time
- import demucs
+ import demucs.api
 
- MODEL_NAME = "openai/whisper-large-v2"
+ MODEL_NAME = "openai/whisper-large-v3"
  DEMUCS_MODEL_NAME = "htdemucs_ft"
  BATCH_SIZE = 8
  FILE_LIMIT_MB = 1000
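For context, MODEL_NAME is consumed by the transformers ASR pipeline whose closing lines appear as context in the next hunk. A minimal standalone sketch of that setup and of the call pattern transcribe() uses; the chunk_length_s value and the sample file name are assumptions, not values taken from this diff:

import torch
from transformers import pipeline

MODEL_NAME = "openai/whisper-large-v3"
BATCH_SIZE = 8
device = 0 if torch.cuda.is_available() else "cpu"

# Long-form transcription pipeline; chunk_length_s=30 is an assumed value.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

# Same call pattern as in transcribe(): timestamps are needed to split the audio into chunks later.
out = pipe("sample.wav", batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe"}, return_timestamps=True)
print(out["chunks"][0])  # e.g. {'timestamp': (0.0, 5.2), 'text': ' ...'}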
@@ -28,13 +70,14 @@ pipe = pipeline(
      device=device,
  )
 
- separator = demucs.api.Separator(model=DEMUCS_MODEL_NAME)
+ separator = demucs.api.Separator(model = DEMUCS_MODEL_NAME, )
 
  def separate_vocal(path):
      origin, separated = separator.separate_audio_file(path)
      demucs.api.save_audio(separated["vocals"], path, samplerate=separator.samplerate)
      return path
 
+ 
  def transcribe(inputs_path, task, use_demucs, dataset_name, oauth_token: gr.OAuthToken | None, progress=gr.Progress()):
      if inputs_path is None:
          raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
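The vocal-separation helper shown above can be exercised on its own; a minimal sketch that reuses exactly the demucs.api calls from this hunk, assuming the demucs package is installed (the input file name is a hypothetical placeholder):

import demucs.api

separator = demucs.api.Separator(model="htdemucs_ft")

def separate_vocal(path):
    # Separate the mixture and overwrite the input file with the isolated vocals stem.
    origin, separated = separator.separate_audio_file(path)
    demucs.api.save_audio(separated["vocals"], path, samplerate=separator.samplerate)
    return path

cleaned = separate_vocal("chunk_0.wav")  # hypothetical input file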
@@ -51,7 +94,7 @@ def transcribe(inputs_path, task, use_demucs, dataset_name, oauth_token: gr.OAut
      current_step += 1
      progress((current_step, total_step), desc="Transcribe using Whisper.")
 
-     sampling_rate, inputs = wavfile.read(inputs_path)
+     sampling_rate, inputs = wavfile.read(inputs_path)
 
      out = pipe(inputs_path, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
@@ -64,21 +107,25 @@ def transcribe(inputs_path, task, use_demucs, dataset_name, oauth_token: gr.OAut
      current_step += 1
      progress((current_step, total_step), desc="Create dataset.")
 
+ 
      transcripts = []
      audios = []
      with tempfile.TemporaryDirectory() as tmpdirname:
-         for i, chunk in enumerate(progress.tqdm(chunks, desc="Creating dataset (and clean audio if asked for)")):
+         for i,chunk in enumerate(progress.tqdm(chunks, desc="Creating dataset (and clean audio if asked for)")):
+ 
+             # TODO: make sure 1D or 2D?
              arr = chunk["audio"]
              path = os.path.join(tmpdirname, f"{i}.wav")
-             wavfile.write(path, sampling_rate, arr)
+             wavfile.write(path, sampling_rate, arr)
 
              if use_demucs == "separate-audio":
+                 # use demucs to separate vocals
                  print(f"Separating vocals #{i}")
                  path = separate_vocal(path)
 
              audios.append(path)
              transcripts.append(chunk["text"])
- 
+ 
      dataset = Dataset.from_dict({"audio": audios, "text": transcripts}).cast_column("audio", Audio())
 
      current_step += 1
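The loop above writes one wav file per merged Whisper chunk and pairs it with its transcript before the push step that follows; a minimal sketch of the same build-and-push pattern with the datasets library, using placeholder file names, repository id, and token:

from datasets import Audio, Dataset

audios = ["0.wav", "1.wav"]                   # placeholder chunk files on disk
transcripts = ["First sentence.", "Second."]  # matching transcripts

dataset = Dataset.from_dict({"audio": audios, "text": transcripts}).cast_column("audio", Audio())
# Requires write access to the Hub; both the repo id and the token are placeholders.
dataset.push_to_hub("user/my-tts-dataset", token="hf_xxx")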
@@ -87,6 +134,7 @@ def transcribe(inputs_path, task, use_demucs, dataset_name, oauth_token: gr.OAut
 
      return [[transcript] for transcript in transcripts], text
 
+ 
  def _return_yt_html_embed(yt_url):
      video_id = yt_url.split("?v=")[-1]
      HTML_str = (
@@ -126,7 +174,8 @@ def download_yt_audio(yt_url, filename):
          except youtube_dl.utils.ExtractorError as err:
              raise gr.Error(str(err))
 
- def yt_transcribe(yt_url, task, use_demucs, dataset_name, oauth_token: gr.OAuthToken | None, max_filesize=75.0, dataset_sampling_rate=24000,
+ 
+ def yt_transcribe(yt_url, task, use_demucs, dataset_name, oauth_token: gr.OAuthToken | None, max_filesize=75.0, dataset_sampling_rate = 24000,
                    progress=gr.Progress()):
 
      if yt_url is None:
@@ -141,7 +190,7 @@ def yt_transcribe(yt_url, task, use_demucs, dataset_name, oauth_token: gr.OAuthT
 
      if oauth_token is None:
          gr.Warning("Make sure to click and login before using this demo.")
-         return html_embed_str, [["transcripts will appear here"]], ""
+         return html_embed_str, [["transcripts will appear here"]], ""
 
      current_step += 1
      progress((current_step, total_step), desc="Load video.")
@@ -174,102 +223,133 @@ def yt_transcribe(yt_url, task, use_demucs, dataset_name, oauth_token: gr.OAuthT
      transcripts = []
      audios = []
      with tempfile.TemporaryDirectory() as tmpdirname:
-         for i, chunk in enumerate(progress.tqdm(chunks, desc="Creating dataset (and clean audio if asked for).")):
+         for i,chunk in enumerate(progress.tqdm(chunks, desc="Creating dataset (and clean audio if asked for).")):
+ 
+             # TODO: make sure 1D or 2D?
              arr = chunk["audio"]
              path = os.path.join(tmpdirname, f"{i}.wav")
-             wavfile.write(path, dataset_sampling_rate, arr)
+             wavfile.write(path, dataset_sampling_rate, arr)
 
              if use_demucs == "separate-audio":
+                 # use demucs to separate vocals
                  print(f"Separating vocals #{i}")
                  path = separate_vocal(path)
 
              audios.append(path)
              transcripts.append(chunk["text"])
- 
+ 
      dataset = Dataset.from_dict({"audio": audios, "text": transcripts}).cast_column("audio", Audio())
 
      current_step += 1
      progress((current_step, total_step), desc="Push dataset.")
      dataset.push_to_hub(dataset_name, token=oauth_token.token if oauth_token else oauth_token)
+ 
 
      return html_embed_str, [[transcript] for transcript in transcripts], text
 
- def naive_postprocess_whisper_chunks(chunks, audio_array, sampling_rate, stop_chars=".!:;?", min_duration=5):
+ 
+ def naive_postprocess_whisper_chunks(chunks, audio_array, sampling_rate, stop_chars = ".!:;?", min_duration = 5):
+     # merge chunks as long as the merged audio duration is below min_duration and a stop character has not been met
+     # returns a list of dictionaries (text, audio)
+     # min_duration is in seconds
      min_duration = int(min_duration * sampling_rate)
 
+ 
      new_chunks = []
      while chunks:
          current_chunk = chunks.pop(0)
+ 
          begin, end = current_chunk["timestamp"]
-         begin, end = int(begin * sampling_rate), int(end * sampling_rate)
-         current_dur = end - begin
+         begin, end = int(begin*sampling_rate), int(end*sampling_rate)
+ 
+         current_dur = end-begin
+ 
          text = current_chunk["text"]
 
-         chunk_to_concat = []
-         while chunks and (current_dur < min_duration or text[-1] not in stop_chars):
-             next_chunk = chunks.pop(0)
-             next_text = next_chunk["text"].strip()
-             next_begin, next_end = next_chunk["timestamp"]
-             next_begin, next_end = int(next_begin * sampling_rate), int(next_end * sampling_rate)
-             current_dur += next_end - next_begin
-             text += f" {next_text}"
-             end = next_end
- 
-         new_chunks.append(
-             {
-                 "audio": np.array(audio_array[begin:end]).astype(np.float32),
-                 "text": text,
-             }
-         )
+ 
+         chunk_to_concat = [audio_array[begin:end]]
+         while chunks and (text[-1] not in stop_chars or (current_dur<min_duration)):
+             ch = chunks.pop(0)
+             begin, end = ch["timestamp"]
+             begin, end = int(begin*sampling_rate), int(end*sampling_rate)
+             current_dur += end-begin
+ 
+             text = "".join([text, ch["text"]])
+ 
+             # TODO: add silence?
+             chunk_to_concat.append(audio_array[begin:end])
 
+         new_chunks.append({
+             "text": text.strip(),
+             "audio": np.concatenate(chunk_to_concat),
+         })
+         print(f"LENGTH CHUNK #{len(new_chunks)}: {current_dur/sampling_rate}s")
+ 
      return new_chunks
- 
- with gr.Blocks() as demo:
-     with gr.Row():
-         with gr.Column():
-             gr.Markdown("### Audio or YouTube Video Transcription")
-             with gr.Row():
-                 yt_textbox = gr.Textbox(label="YouTube link")
-                 yt_button = gr.Button("Transcribe YouTube video")
-         with gr.Column():
-             gr.Markdown("### Upload or Record Audio")
-             local_audio_input = gr.Audio(type="filepath", label="Upload Audio")
-             local_button = gr.Button("Transcribe Local Audio")
- 
-     task = gr.Radio(
-         ["transcribe", "translate"],
-         label="Task",
-         value="transcribe",
-     )
- 
-     demucs_checkbox = gr.CheckboxGroup(["separate-audio"], label="Apply Demucs (Separate Vocal from Audio)")
-     dataset_name = gr.Textbox(label="Dataset name", placeholder="Dataset name to push to Hugging Face Hub")
- 
-     with gr.Row():
-         login_button = gr.Button("Login")
-         login_output = gr.Markdown()
- 
-     with gr.Row():
-         output_transcriptions = gr.Dataframe(headers=["Transcriptions"])
-         output_text = gr.Markdown()
- 
-     login_button.click(
-         fn=None,
-         inputs=None,
-         outputs=login_output,
-         _js="function() { return window.location = 'https://huggingface.co/login'; }",
-     )
- 
-     yt_button.click(
-         yt_transcribe,
-         inputs=[yt_textbox, task, demucs_checkbox, dataset_name, login_button],
-         outputs=[login_output, output_transcriptions, output_text],
-     )
- 
-     local_button.click(
-         transcribe,
-         inputs=[local_audio_input, task, demucs_checkbox, dataset_name, login_button],
-         outputs=[output_transcriptions, output_text],
-     )
- 
- demo.launch()
+ 
+ css = """
+ #intro{
+     max-width: 100%;
+     text-align: center;
+     margin: 0 auto;
+ }
+ """
+ with gr.Blocks(css=css) as demo:
+     with gr.Row():
+         gr.LoginButton()
+         gr.LogoutButton()
+ 
+     with gr.Tab("YouTube"):
+         gr.Markdown("Create your own TTS dataset using YouTube", elem_id="intro")
+         gr.Markdown(
+             "This demo allows you to create a text-to-speech dataset from an input audio snippet and push it to the hub to keep track of it."
+             f" The demo uses the checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to automatically transcribe audio files"
+             " of arbitrary length. It then merges chunks of audio and pushes them to the hub."
+         )
+         with gr.Row():
+             with gr.Column():
+                 audio_youtube = gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")
+                 task_youtube = gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
+                 cleaning_youtube = gr.Radio(["no-post-processing", "separate-audio"], label="Audio separation and cleaning (takes longer - use it if your samples are not cleaned (background noise and music))", value="separate-audio")
+                 textbox_youtube = gr.Textbox(lines=1, placeholder="Place your new dataset name here. Should be in the format: <user>/<dataset_name> or <org>/<dataset_name>. Also accepts <dataset_name>, which will default to the namespace of the logged-in user.", label="Dataset name")
+ 
+                 with gr.Row():
+                     clear_youtube = gr.ClearButton([audio_youtube, task_youtube, cleaning_youtube, textbox_youtube])
+                     submit_youtube = gr.Button("Submit")
+ 
+             with gr.Column():
+                 html_youtube = gr.HTML()
+                 dataset_youtube = gr.Dataset(label="Transcribed samples.", components=["text"], headers=["Transcripts"], samples=[["transcripts will appear here"]])
+                 transcript_youtube = gr.Textbox(label="Transcription")
+ 
+     with gr.Tab("Microphone or Audio file"):
+         gr.Markdown("Create your own TTS dataset using your own recordings", elem_id="intro")
+         gr.Markdown(
+             "This demo allows you to create a text-to-speech dataset from an input audio snippet and push it to the hub to keep track of it."
+             f" The demo uses the checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to automatically transcribe audio files"
+             " of arbitrary length. It then merges chunks of audio and pushes them to the hub."
+         )
+         with gr.Row():
+             with gr.Column():
+                 audio_file = gr.Audio(type="filepath")
+                 task_file = gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
+                 cleaning_file = gr.Radio(["no-post-processing", "separate-audio"], label="Audio separation and cleaning (takes longer - use it if your samples are not cleaned (background noise and music))", value="separate-audio")
+                 textbox_file = gr.Textbox(lines=1, placeholder="Place your new dataset name here. Should be in the format: <user>/<dataset_name> or <org>/<dataset_name>. Also accepts <dataset_name>, which will default to the namespace of the logged-in user.", label="Dataset name")
+ 
+                 with gr.Row():
+                     clear_file = gr.ClearButton([audio_file, task_file, cleaning_file, textbox_file])
+                     submit_file = gr.Button("Submit")
+ 
+             with gr.Column():
+                 dataset_file = gr.Dataset(label="Transcribed samples.", components=["text"], headers=["Transcripts"], samples=[["transcripts will appear here"]])
+                 transcript_file = gr.Textbox(label="Transcription")
+ 
+ 
+     submit_file.click(transcribe, inputs=[audio_file, task_file, cleaning_file, textbox_file], outputs=[dataset_file, transcript_file])
+     submit_youtube.click(yt_transcribe, inputs=[audio_youtube, task_youtube, cleaning_youtube, textbox_youtube], outputs=[html_youtube, dataset_youtube, transcript_youtube])
+ 
+ demo.launch(debug=True)
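Once either tab has pushed a dataset, it can be pulled back for TTS training or inspection; a minimal sketch, assuming the placeholder repository id used in the earlier sketch:

from datasets import load_dataset

ds = load_dataset("user/my-tts-dataset", split="train")  # placeholder repo id
sample = ds[0]
print(sample["text"])
print(sample["audio"]["sampling_rate"], sample["audio"]["array"].shape)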