Fix issue with single uploaded file not being transcribed if too long
README.md CHANGED

@@ -4,7 +4,7 @@ emoji: 🐠
 colorFrom: blue
 colorTo: gray
 sdk: gradio
-sdk_version: 3.
+sdk_version: 3.12
 app_file: app.py
 pinned: false
 license: apache-2.0
app.py CHANGED

@@ -1,10 +1,12 @@
 import os
 import json
+import shutil
 import uuid
 import tempfile
 import subprocess
 import re
 import time
+import traceback

 import gradio as gr
 import pytube as pt
@@ -24,6 +26,8 @@ os.environ[constants.NEMO_ENV_CACHE_DIR] = "/tmp/nemo/"

 SAMPLE_RATE = 16000  # Default sample rate for ASR
 BUFFERED_INFERENCE_DURATION_THRESHOLD = 60.0  # 60 second and above will require chunked inference.
+CHUNK_LEN_IN_SEC = 20.0  # Chunk size
+BUFFER_LEN_IN_SEC = 30.0  # Total buffer size

 TITLE = "NeMo ASR Inference on Hugging Face"
 DESCRIPTION = "Demo of all languages supported by NeMo ASR"
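The two new constants parameterize the buffered (chunked) inference path that kicks in once an upload crosses the 60 s threshold. As a rough sketch of how a 20 s chunk sits inside a 30 s buffer (iter_buffers and its symmetric padding scheme are illustrative assumptions, not code from app.py):

import numpy as np

SAMPLE_RATE = 16000
CHUNK_LEN_IN_SEC = 20.0
BUFFER_LEN_IN_SEC = 30.0

def iter_buffers(samples: np.ndarray):
    # Each step feeds the model 20 s of fresh audio inside a 30 s window,
    # i.e. (30 - 20) / 2 = 5 s of acoustic context on either side.
    chunk = int(CHUNK_LEN_IN_SEC * SAMPLE_RATE)
    buffer = int(BUFFER_LEN_IN_SEC * SAMPLE_RATE)
    pad = (buffer - chunk) // 2
    padded = np.pad(samples, (pad, pad))
    for start in range(0, len(samples), chunk):
        window = padded[start : start + buffer]
        if len(window) < buffer:  # zero-pad the final, shorter window
            window = np.pad(window, (0, buffer - len(window)))
        yield window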
@@ -184,11 +188,14 @@ def convert_audio(audio_filepath):
         return audio_filepath

     out_filename = os.path.join(filedir, filename + '.wav')
+
     process = subprocess.Popen(
-        ['ffmpeg', '-i', audio_filepath, '-ac', '1', '-ar', str(SAMPLE_RATE), out_filename],
+        ['ffmpeg', '-y', '-i', audio_filepath, '-ac', '1', '-ar', str(SAMPLE_RATE), out_filename],
         stdout=subprocess.PIPE,
         stderr=subprocess.STDOUT,
+        close_fds=True,
     )
+
     stdout, stderr = process.communicate()

     if os.path.exists(out_filename):
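The added '-y' flag is likely the operative fix in this hunk: when out_filename already exists, ffmpeg asks before overwriting, and with no stdin attached the Popen call simply hangs. A self-contained sketch of the same conversion using subprocess.run (check=True is added here for illustration; the app itself keeps Popen plus communicate()):

import subprocess

SAMPLE_RATE = 16000

def to_mono_16k(src: str, dst: str) -> None:
    # '-y' overwrites dst instead of waiting on an interactive prompt;
    # close_fds=True (the Python 3 default, made explicit) keeps inherited
    # file descriptors out of the ffmpeg child process.
    subprocess.run(
        ['ffmpeg', '-y', '-i', src, '-ac', '1', '-ar', str(SAMPLE_RATE), dst],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        check=True,
        close_fds=True,
    )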
@@ -368,6 +375,7 @@ def infer_audio(model_name: str, audio_file: str) -> str:

 def transcribe(microphone, audio_file, model_name):

+    audio_data = None
     warn_output = ""
     if (microphone is not None) and (audio_file is not None):
         warn_output = (
@@ -384,15 +392,32 @@ def transcribe(microphone, audio_file, model_name):
     else:
         audio_data = audio_file

+    if audio_data is not None:
+        audio_duration = parse_duration(audio_data)
+    else:
+        audio_duration = None
+
     time_diff = None
     try:
-        # Use HF API for transcription
-        start = time.time()
-        transcriptions = infer_audio(model_name, audio_data)
-        end = time.time()
-        time_diff = end - start
+        with tempfile.TemporaryDirectory() as tempdir:
+            filename = os.path.split(audio_data)[-1]
+            new_audio_data = os.path.join(tempdir, filename)
+            shutil.copy2(audio_data, new_audio_data)
+
+            if os.path.exists(audio_data):
+                os.remove(audio_data)
+
+            audio_data = new_audio_data
+
+            # Use HF API for transcription
+            start = time.time()
+            transcriptions = infer_audio(model_name, audio_data)
+            end = time.time()
+            time_diff = end - start

     except Exception as e:
+        traceback.print_exc()
+
         transcriptions = ""
         warn_output = warn_output
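This rewritten try block is the substance of the commit: the uploaded file is copied into a temporary directory the app owns before the potentially long buffered inference starts, so the framework's own temp-file cleanup cannot remove the upload mid-transcription. The same copy-then-own pattern in isolation, with illustrative names:

import os
import shutil
import tempfile

def claim_upload(upload_path: str) -> str:
    # Copy a framework-owned upload into a directory we control, then drop
    # the original; the returned path stays valid until the caller cleans
    # up owned_dir (the app uses a TemporaryDirectory context instead).
    owned_dir = tempfile.mkdtemp()
    owned_path = os.path.join(owned_dir, os.path.basename(upload_path))
    shutil.copy2(upload_path, owned_path)  # copy2 also preserves metadata
    if os.path.exists(upload_path):
        os.remove(upload_path)
    return owned_path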
@@ -412,8 +437,6 @@ def transcribe(microphone, audio_file, model_name):
     if transcriptions.startswith("Error:-"):
         html_output = build_html_output(transcriptions, style="result_item_error")
     else:
-        audio_duration = parse_duration(audio_data)
-
         output = f"Successfully transcribed on {get_device()} ! <br>" f"Transcription Time : {time_diff: 0.3f} s"

         if audio_duration > BUFFERED_INFERENCE_DURATION_THRESHOLD:
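Hoisting the parse_duration call above the try block follows from the temporary-directory change: the original upload is deleted during transcription, so the duration has to be read before that happens rather than after, as the old code did. parse_duration itself is defined elsewhere in app.py; a hedged sketch of a typical ffprobe-based equivalent:

import json
import subprocess

def probe_duration(path: str) -> float:
    # Ask ffprobe for container metadata as JSON and pull out the duration.
    out = subprocess.run(
        ['ffprobe', '-v', 'quiet', '-print_format', 'json', '-show_format', path],
        stdout=subprocess.PIPE,
        check=True,
    ).stdout
    return float(json.loads(out)['format']['duration'])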
@@ -533,10 +556,11 @@ with demo:

         lang_selector, models_in_lang = create_lang_selector_component()

+        run = gr.components.Button('Transcribe')
+
         transcript = gr.components.Label(label='Transcript')
         audio_html_output = gr.components.HTML()

-        run = gr.components.Button('Transcribe')
         run.click(
             transcribe, inputs=[microphone, file_upload, models_in_lang], outputs=[transcript, audio_html_output]
         )
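The final hunk is cosmetic: components in a gr.Blocks layout render in the order they are created, so constructing the Button before the output components places "Transcribe" above the transcript while run.click is still wired up afterwards. A minimal sketch under that assumption, with the app's components simplified:

import gradio as gr

with gr.Blocks() as demo:
    audio_in = gr.Audio(type="filepath", label="Audio")
    run = gr.Button('Transcribe')           # now rendered above the outputs
    transcript = gr.Label(label='Transcript')
    run.click(lambda path: str(path), inputs=[audio_in], outputs=[transcript])

demo.launch()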