|
import gradio as gr |
|
import torch |
|
from transformers import Speech2Text2Processor, SpeechEncoderDecoderModel |
|
import soundfile as sf |
|
|
|
|
|
# Hugging Face Hub checkpoint id shared by the model weights and its processor.
_CHECKPOINT = "facebook/s2t-wav2vec2-large-en-de"

# Both objects are created once, at import time, as module-level globals.
processor = Speech2Text2Processor.from_pretrained(_CHECKPOINT)
model = SpeechEncoderDecoderModel.from_pretrained(_CHECKPOINT)
|
|
|
|
|
# Sample rate passed to the processor; audio at any other rate is resampled to it.
_TARGET_SAMPLE_RATE = 16_000


def _resample_linear(waveform, orig_rate, target_rate):
    """Resample a 1-D float waveform to ``target_rate`` by linear interpolation.

    Uses only ``torch`` (already a dependency of this file). There is no
    anti-aliasing filter, so this is a pragmatic fallback rather than a
    studio-quality resampler.
    """
    if orig_rate == target_rate:
        return waveform
    new_len = max(1, int(round(len(waveform) * target_rate / orig_rate)))
    # interpolate() expects (batch, channels, length) for 1-D signals.
    signal = torch.as_tensor(waveform, dtype=torch.float32).view(1, 1, -1)
    resampled = torch.nn.functional.interpolate(
        signal, size=new_len, mode="linear", align_corners=False
    )
    return resampled.view(-1).numpy()


def transcribe_speech(file_info):
    """Run the speech model on an uploaded audio file and return its text.

    Args:
        file_info: Path to the audio file, or ``None`` / empty when nothing
            was uploaded.

    Returns:
        The decoded text for the clip, or an empty string when no file was
        given or the file contains no samples.

    Notes:
        Which container formats decode (e.g. MP3) depends on the libsndfile
        build behind ``soundfile`` -- TODO confirm for the deployment image.
        The checkpoint name loaded by this file ends in ``en-de``, so the text
        is presumably a German translation of English speech -- verify
        against the model card.
    """
    # Submitting the form without an upload yields no path; don't crash on it.
    if not file_info:
        return ""

    # always_2d gives a (frames, channels) array regardless of channel count.
    speech, sample_rate = sf.read(file_info, dtype="float32", always_2d=True)
    if speech.shape[0] == 0:
        return ""

    # Down-mix to mono: the processor is given a single 1-D waveform.
    speech = speech.mean(axis=1)
    # Honour the file's real sample rate instead of assuming it is 16 kHz.
    speech = _resample_linear(speech, sample_rate, _TARGET_SAMPLE_RATE)

    inputs = processor(
        speech, sampling_rate=_TARGET_SAMPLE_RATE, return_tensors="pt"
    )
    generated_ids = model.generate(
        inputs=inputs["input_values"],
        attention_mask=inputs["attention_mask"],
    )
    # Drop special tokens so they do not leak into the user-facing text.
    transcription = processor.batch_decode(
        generated_ids, skip_special_tokens=True
    )
    return transcription[0]
|
|
|
|
|
# Gradio UI: one audio-upload input handed to transcribe_speech as a file
# path, with the returned string shown as plain text.
# NOTE(review): `gr.inputs` looks like the legacy component namespace; newer
# Gradio releases expose `gr.Audio` directly -- confirm against the pinned version.
iface = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.inputs.Audio(source="upload", type="filepath", label="Upload your MP3 file"),
    outputs="text",
    title="Speech to Text Conversion",
    # The loaded checkpoint is an English-to-German speech translation model,
    # so the description states that the output text is German.
    description=(
        "Upload an MP3 file with English speech to translate it into German "
        "text using a state-of-the-art speech-to-text model."
    ),
)

# Start the web server only when executed as a script, so importing this
# module (e.g. from tests or tooling) does not launch the app.
if __name__ == "__main__":
    iface.launch()
|
|