|
import gradio as gr |
|
import nemo.collections.asr as nemo_asr |
|
from pydub import AudioSegment |
|
import pyaudioconvert as pac |
|
|
|
|
|
# Download and load the pretrained East-African-languages conformer ASR model
# (Kinyarwanda / Swahili / Luganda per the UI description below) from the
# Hugging Face hub via NeMo.
# NOTE(review): the class is EncDecRNNTBPEModel but the checkpoint name says
# "ctc" — confirm the model class actually matches this checkpoint.
hf_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(

    model_name="mbazaNLP/stt_rw_sw_lg_conformer_ctc_large")
|
|
|
def convert (audio):
    """Normalize an uploaded audio file to 16-bit mono WAV, in place.

    mp3/ogg inputs are first re-encoded as WAV via pydub (the original path
    is reused, so the file keeps its old extension on disk), then every
    accepted file is run through pyaudioconvert to force 16-bit mono PCM.

    Args:
        audio: an object exposing a ``.name`` attribute holding the file
            path (e.g. a tempfile handed over by a Gradio upload).

    Returns:
        True when the file was accepted and converted; False when the
        extension is not one of .mp3 / .wav / .ogg.
    """
    # Case-insensitive check, and require an actual ".ext" suffix —
    # the previous endswith("mp3") also matched names like "foomp3".
    file_name = audio.name.lower()
    if not file_name.endswith((".mp3", ".wav", ".ogg")):
        return False
    if file_name.endswith(".mp3"):
        sound = AudioSegment.from_mp3(audio.name)
        sound.export(audio.name, format="wav")
    elif file_name.endswith(".ogg"):
        sound = AudioSegment.from_ogg(audio.name)
        sound.export(audio.name, format="wav")
    # .wav inputs skip re-encoding and go straight to the 16-bit mono pass.
    # NOTE(review): pac fixes bit depth/channels — sample rate is assumed to
    # already suit the model; confirm.
    pac.convert_wav_to_16bit_mono(audio.name, audio.name)
    return True
|
|
|
def transcribe(audio, audio_microphone):
    """Transcribe an uploaded or microphone-recorded audio clip.

    The microphone recording takes precedence when both inputs are given.

    Args:
        audio: uploaded file object (with ``.name``) or None.
        audio_microphone: microphone recording object or None.

    Returns:
        The recognized text, or an error message when no audio was
        provided or the format is unsupported.
    """
    audio = audio_microphone if audio_microphone else audio
    # Guard: both inputs are optional in the UI, so the user can submit
    # with nothing selected — previously this crashed on `audio.name`.
    if audio is None:
        return "Please upload or record an audio clip"
    if not convert(audio):
        return "The format must be mp3, wav or ogg"
    # NeMo's transcribe takes a batch of paths and returns a batch of texts.
    result = hf_model.transcribe([audio.name])
    return result[0]
|
# Build the Gradio interface: two optional audio inputs (file upload and
# browser microphone) feed transcribe(); a single textbox shows the result.
# NOTE(review): gr.inputs / gr.outputs, `optional=`, `source=` and
# `enable_queue=` are the legacy pre-3.x Gradio API — confirm the pinned
# gradio version before upgrading.
gradio_ui = gr.Interface(

    fn=transcribe,

    title="East african languages Speech Recognition",

    description="Upload an audio clip or record from browser using microphone, and let AI do the hard work of transcribing. The supported languages are Kinyarwanda, Swahili and Luganda",

    inputs=[gr.inputs.Audio(label="Upload Audio File", type="filepath", optional=True), gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Record from microphone")],

    outputs=[gr.outputs.Textbox(label="Recognized speech")]

)

# Queueing serializes requests so long transcriptions don't time out.
gradio_ui.launch(enable_queue=True)