File size: 1,130 Bytes
07f4993
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import gradio as gr
import torch
from espnet2.bin.asr_inference import Speech2Text
from espnet_model_zoo.downloader import ModelDownloader

# Fetch the pretrained ESPnet ASR model files and build the recognizer once at
# import time so every request reuses the same loaded model.
_MODEL_TAG = "espnet/simple_asr_train_asr_transformer_e18_raw_bpe_sp_valid.acc.best"

d = ModelDownloader()
# download_and_unpack() yields the keyword arguments that Speech2Text is
# constructed from (they are expanded with ** below).
_model_kwargs = d.download_and_unpack(_MODEL_TAG)
asr_model = Speech2Text(
    device="cpu",  # Switch to "cuda" when running on a GPU
    **_model_kwargs,
)

def transcribe(audio):
    """Transcribe a Gradio audio recording to text using the ESPnet model.

    Args:
        audio: The value Gradio passes for a ``type="numpy"`` audio input:
            an indexable whose second item holds the samples (the first is
            the sample rate), or ``None`` when nothing was recorded.

    Returns:
        The text of the first hypothesis returned by ``asr_model``, or an
        empty string when there is no audio to decode or the model returns
        no hypothesis.
    """
    # Gradio submits None when the user has not recorded anything; indexing
    # it would raise TypeError.
    if audio is None:
        return ""

    # NOTE(review): audio[0] (the sample rate) is ignored. The model
    # presumably expects one fixed rate (16 kHz is common) - confirm and
    # resample if the microphone rate differs.
    speech = torch.tensor(audio[1])  # Extract the audio waveform
    if speech.numel() == 0:
        return ""

    if speech.is_floating_point():
        speech = speech.to(torch.float32)
    else:
        # Integer PCM (Gradio's numpy audio is typically int16): rescale to
        # the [-1, 1) float range instead of feeding raw integer amplitudes.
        full_scale = float(torch.iinfo(speech.dtype).max) + 1.0
        speech = speech.to(torch.float32) / full_scale

    if speech.dim() > 1:
        # Multi-channel recordings arrive as (samples, channels); average the
        # channels so the model receives a single 1-D waveform.
        speech = speech.mean(dim=-1)

    result = asr_model(speech)
    if not result:
        return ""
    text, *_ = result[0]  # First hypothesis; only its text is needed
    return text

# Build the microphone input. The original `source=` keyword is tried first so
# behaviour is unchanged where it is accepted; Gradio releases that replaced it
# with `sources=[...]` reject it with TypeError, in which case the newer
# spelling is used.
try:
    _mic_input = gr.Audio(source="microphone", type="numpy")
except TypeError:
    _mic_input = gr.Audio(sources=["microphone"], type="numpy")

# Create a simple Gradio interface
interface = gr.Interface(
    fn=transcribe,                   # Function to call
    inputs=_mic_input,               # Audio input from microphone
    outputs="text",                  # Output type (text transcription)
    title="ESPnet ASR Demo",         # Title of the UI
    description="Simple ESPnet-based speech recognition",  # Description of the app
)

# Launch the app
interface.launch()