espnet / app.py
Gorgefound's picture
Create app.py
07f4993 verified
import gradio as gr
import torch
from espnet2.bin.asr_inference import Speech2Text
from espnet_model_zoo.downloader import ModelDownloader
# Fetch the pretrained model through the ESPnet model zoo and build the
# recognizer from it. Whatever `download_and_unpack` returns is expanded into
# keyword arguments for `Speech2Text`.
d = ModelDownloader()
asr_model = Speech2Text(
    device="cpu",  # Change to "cuda" if using a GPU
    **d.download_and_unpack(
        "espnet/simple_asr_train_asr_transformer_e18_raw_bpe_sp_valid.acc.best"
    ),
)
def transcribe(audio):
    """Transcribe speech to text using ESPnet.

    Args:
        audio: The value Gradio hands over for a ``type="numpy"`` audio
            input: a ``(sample_rate, samples)`` pair, or ``None`` when
            nothing was recorded.

    Returns:
        The text of the first hypothesis returned by the recognizer, or an
        empty string when there is no audio to decode or the recognizer
        returns no hypotheses.
    """
    # Gradio submits None when the user never recorded anything.
    if audio is None:
        return ""

    # NOTE(review): the sample rate is not used and no resampling is done;
    # the recording is presumably expected to match the model's rate — confirm.
    _sample_rate, samples = audio
    speech = torch.tensor(samples)  # Extract the audio waveform

    # A zero-length recording has nothing to decode.
    if speech.numel() == 0:
        return ""

    if not speech.is_floating_point():
        # Integer PCM (e.g. int16) is rescaled to float32 in [-1, 1).
        # NOTE(review): presumably the model was trained on float waveforms
        # in that range — confirm against the training recipe.
        full_scale = float(torch.iinfo(speech.dtype).max) + 1.0
        speech = speech.to(torch.float32) / full_scale

    if speech.dim() > 1:
        # Collapse multi-channel audio to mono by averaging channels.
        # Assumes a (samples, channels) layout — TODO confirm.
        speech = speech.mean(dim=-1)

    results = asr_model(speech)
    if not results:
        return ""

    text, *_ = results[0]  # First element of the first hypothesis is the text
    return text
# Build the microphone input. Older Gradio releases take `source="microphone"`;
# newer ones replaced that keyword with `sources=[...]` and reject `source`
# with a TypeError. Try the original spelling first so behaviour on older
# releases is unchanged, and fall back to the newer one only when needed.
try:
    _microphone_input = gr.Audio(source="microphone", type="numpy")
except TypeError:
    _microphone_input = gr.Audio(sources=["microphone"], type="numpy")

# Create a simple Gradio interface
interface = gr.Interface(
    fn=transcribe,  # Function to call
    inputs=_microphone_input,  # Audio input from microphone
    outputs="text",  # Output type (text transcription)
    title="ESPnet ASR Demo",  # Title of the UI
    description="Simple ESPnet-based speech recognition",  # Description of the app
)

# Launch the app
interface.launch()