import logging

import gradio as gr
import nemo.collections.asr as nemo_asr
import torch
# Static configuration and UI text for the demo page.

# NOTE(review): SAMPLE_RATE is not referenced anywhere in the visible code;
# presumably the audio sample rate (Hz) the models expect - confirm before use.
SAMPLE_RATE = 16000
# Browser/page title passed to gr.Blocks and shown as the top-level heading.
TITLE = "NeMo ASR Inference on Hugging Face"
# Sub-heading rendered under the title.
DESCRIPTION = "Demo of all languages supported by NeMo ASR"
# Model preselected in the "Models" dropdown whenever the language is 'en'.
DEFAULT_EN_MODEL = "nvidia/stt_en_conformer_transducer_xlarge"
# Markdown header shown at the top of the page (built from TITLE/DESCRIPTION).
MARKDOWN = f"""
# {TITLE}
## {DESCRIPTION}
"""
# Custom stylesheet handed to gr.Blocks(css=...).
CSS = """
p.big {
font-size: 20px;
}
"""
# Footer text rendered through gr.components.HTML as the last page element.
ARTICLE = """
NeMo ASR
|
Github Repo
"""
# ---------------------------------------------------------------------------
# Model discovery
#
# Searches the Hugging Face Hub for NeMo ASR checkpoints and builds:
#   SUPPORTED_LANGUAGES       - set of language ids parsed from the model ids
#   SUPPORTED_MODEL_NAMES     - sorted list of usable model ids
#   model_dict                - model id -> loaded Gradio interface
#   SUPPORTED_LANG_MODEL_DICT - language id -> sorted list of model ids
# ---------------------------------------------------------------------------
SUPPORTED_LANGUAGES = set()
SUPPORTED_MODEL_NAMES = set()

# HF models
hf_filter = nemo_asr.models.ASRModel.get_hf_model_filter()
hf_filter.task = "automatic-speech-recognition"
hf_infos = nemo_asr.models.ASRModel.search_huggingface_models(model_filter=hf_filter)

for info in hf_infos:
    # The language id is taken from the second "_"-separated field of the
    # model id (e.g. "nvidia/stt_en_conformer_transducer_xlarge" -> "en").
    id_parts = info.modelId.split("_")

    # Ids that do not have the form "<prefix>_<lang>_<rest>" cannot be mapped
    # to a language below (the "_<lang>_" match would never succeed), so they
    # are skipped instead of raising IndexError or producing a language entry
    # that has no models behind it.
    if len(id_parts) < 3 or not id_parts[1]:
        continue

    SUPPORTED_LANGUAGES.add(id_parts[1])
    SUPPORTED_MODEL_NAMES.add(info.modelId)

SUPPORTED_MODEL_NAMES = sorted(SUPPORTED_MODEL_NAMES)

# One loaded interface per model id; `transcribe` calls these.
model_dict = {model_name: gr.Interface.load(f'models/{model_name}') for model_name in SUPPORTED_MODEL_NAMES}

# Group model ids by language. SUPPORTED_MODEL_NAMES is already sorted, so
# every per-language list comes out sorted without a separate sorting pass.
SUPPORTED_LANG_MODEL_DICT = {}
for lang in SUPPORTED_LANGUAGES:
    lang_marker = "_" + lang + "_"
    matching_ids = [model_id for model_id in SUPPORTED_MODEL_NAMES if lang_marker in model_id]
    if matching_ids:
        SUPPORTED_LANG_MODEL_DICT[lang] = matching_ids
def transcribe(microphone, audio_file, model_name):
    """Transcribe one audio input with the selected model.

    Args:
        microphone: Value of the microphone Audio component, or None when
            nothing was recorded.
        audio_file: Value of the upload Audio component, or None when nothing
            was uploaded.
        model_name: Key into ``model_dict`` selecting the model to call.

    Returns:
        str: The transcription, prefixed by a warning when both inputs were
        supplied (the microphone recording wins). An ``ERROR: ...`` message is
        returned when no audio was supplied or the model name is unknown, and
        a "model is loading" notice is returned when the model call fails.
    """
    # Validate the audio first: an empty submission must always produce this
    # message, whatever the state of the model selection.
    if microphone is None and audio_file is None:
        return "ERROR: You have to either use the microphone or upload an audio file"

    warn_output = ""
    if microphone is not None and audio_file is not None:
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )

    # The microphone recording takes precedence over the uploaded file.
    audio_data = microphone if microphone is not None else audio_file

    # A missing or stale selection yields a readable message instead of an
    # unhandled KeyError.
    model = model_dict.get(model_name)
    if model is None:
        return warn_output + f"ERROR: Unknown model `{model_name}`. Please select a model from the list."

    try:
        # Use HF API for transcription
        transcriptions = model(audio_data)
    except Exception:
        # Best effort: the user gets the generic notice below, while the real
        # cause is kept in the log instead of being discarded.
        logging.getLogger(__name__).exception("Transcription with model %s failed", model_name)
        transcriptions = ""
        warn_output = warn_output + "\n\n"
        warn_output += (
            f"The model `{model_name}` is currently loading and cannot be used "
            f"for transcription.\n"
            f"Please try another model or wait a few minutes."
        )

    return warn_output + transcriptions
def _default_model(lang, models_names):
    """Return the model to preselect for ``lang`` out of ``models_names``.

    English prefers ``DEFAULT_EN_MODEL``, but only when that model is actually
    among the available ones; every other case falls back to the first entry.
    Returns None when ``models_names`` is empty.
    """
    if lang == 'en' and DEFAULT_EN_MODEL in models_names:
        return DEFAULT_EN_MODEL
    return models_names[0] if models_names else None


# ---------------------------------------------------------------------------
# UI layout and event wiring
# ---------------------------------------------------------------------------
demo = gr.Blocks(title=TITLE, css=CSS)

with demo:
    header = gr.Markdown(MARKDOWN)

    # The two audio sources sit side by side.
    with gr.Row() as row:
        file_upload = gr.components.Audio(source="upload", type='filepath', label='Upload File')
        microphone = gr.components.Audio(source="microphone", type='filepath', label='Microphone')

    # Tolerate a discovery result without English models (or without the
    # default English model) instead of failing with KeyError at start-up.
    language_choices = sorted(SUPPORTED_LANGUAGES)
    initial_models = sorted(SUPPORTED_LANG_MODEL_DICT.get("en", []))

    lang_selector = gr.components.Dropdown(
        choices=language_choices,
        value="en" if "en" in language_choices else None,
        type="value",
        label="Languages",
        interactive=True,
    )
    models_in_lang = gr.components.Dropdown(
        choices=initial_models,
        value=_default_model("en", initial_models),
        label="Models",
        interactive=True,
    )

    def update_models_with_lang(lang):
        """Refresh the model dropdown after the language selection changed."""
        models_names = sorted(SUPPORTED_LANG_MODEL_DICT.get(lang, []))
        return models_in_lang.update(choices=models_names, value=_default_model(lang, models_names))

    lang_selector.change(update_models_with_lang, inputs=[lang_selector], outputs=[models_in_lang])

    transcript = gr.components.Label(label='Transcript')

    run = gr.components.Button('Transcribe')
    # Argument order must match transcribe(microphone, audio_file, model_name).
    run.click(transcribe, inputs=[microphone, file_upload, models_in_lang], outputs=[transcript])

    gr.components.HTML(ARTICLE)

# Process queued requests one at a time.
demo.queue(concurrency_count=1)
demo.launch()