"""Minimal Gradio demo: microphone speech recognition with a pretrained ESPnet model."""

import inspect

import gradio as gr
import torch
from espnet2.bin.asr_inference import Speech2Text
from espnet_model_zoo.downloader import ModelDownloader
from scipy.signal import resample_poly

# Sample rate the waveform is resampled to before inference.
# NOTE(review): 16 kHz is assumed here, not read from the model — TODO confirm
# against the downloaded model's training config.
MODEL_SAMPLE_RATE = 16_000

# Download a pretrained model
d = ModelDownloader()
asr_model = Speech2Text(
    **d.download_and_unpack(
        "espnet/simple_asr_train_asr_transformer_e18_raw_bpe_sp_valid.acc.best"
    ),
    device="cpu",  # Change to "cuda" if using a GPU
)


def _to_model_waveform(sample_rate, samples):
    """Convert raw recorder samples to a mono float32 tensor at MODEL_SAMPLE_RATE.

    Args:
        sample_rate: Sampling rate of ``samples`` in Hz (an int).
        samples: Array of shape ``(n,)`` or ``(n, channels)``; integer PCM or
            floating point.

    Returns:
        A 1-D ``torch.float32`` tensor.
    """
    waveform = torch.as_tensor(samples)
    if waveform.is_floating_point():
        waveform = waveform.to(torch.float32)
    else:
        # Integer PCM: scale into [-1, 1) so the model sees the same range as
        # audio loaded from a file as floats.
        full_scale = torch.iinfo(waveform.dtype).max + 1
        waveform = waveform.to(torch.float32) / full_scale

    if waveform.dim() > 1:
        # Average the channels (last axis) down to mono.
        waveform = waveform.mean(dim=-1)

    if sample_rate != MODEL_SAMPLE_RATE and waveform.numel() > 0:
        # Polyphase resampling applies an anti-aliasing filter, unlike plain
        # decimation or linear interpolation.
        resampled = resample_poly(waveform.numpy(), MODEL_SAMPLE_RATE, sample_rate)
        waveform = torch.tensor(resampled, dtype=torch.float32)

    return waveform


def transcribe(audio):
    """Transcribe speech to text using ESPnet.

    Args:
        audio: The value Gradio passes for a ``type="numpy"`` audio input: a
            ``(sample_rate, samples)`` tuple, or ``None`` when nothing was
            recorded.

    Returns:
        The text of the first hypothesis returned by the model, or an empty
        string when there is no audio or no hypothesis.
    """
    if audio is None:
        # Submitting without a recording would otherwise fail on unpacking.
        return ""

    sample_rate, samples = audio
    waveform = _to_model_waveform(sample_rate, samples)
    if waveform.numel() == 0:
        return ""

    hypotheses = asr_model(waveform)
    if not hypotheses:
        return ""

    text, *_ = hypotheses[0]  # Get the transcription from the first result
    return text


def _microphone_input():
    """Build the microphone audio component for the installed Gradio version.

    Newer Gradio releases take ``sources=[...]`` where older ones took
    ``source=...``; the signature is inspected so either version works.
    """
    if "sources" in inspect.signature(gr.Audio.__init__).parameters:
        return gr.Audio(sources=["microphone"], type="numpy")
    return gr.Audio(source="microphone", type="numpy")


# Create a simple Gradio interface
interface = gr.Interface(
    fn=transcribe,  # Function to call
    inputs=_microphone_input(),  # Audio input from microphone
    outputs="text",  # Output type (text transcription)
    title="ESPnet ASR Demo",  # Title of the UI
    description="Simple ESPnet-based speech recognition",  # Description of the app
)

# Launch the app
interface.launch()